How to find the longest common substring pattern in different group of strings in SQL
August 14th, 2007 By Frank Zhou
The following SQL pattern can be used to find the longest common substring pattern in different group of strings.
create table t8 as
Select 1 as id, 10 as grp_id, 'afpOnef1' as in_str from dual
union all
select 2 as id, 10 as grp_id, '1Onez' as in_str from dual
union all
select 3 as id, 10 as grp_id, '91Onez' as in_str from dual
union all
select 4 as id, 20 as grp_id, 'bptwoj' as in_str from dual
union all
select 5 as id ,20 as grp_id, '9patwof' as in_str from dual
union all
select 6 as id ,20 as grp_id, '+patwo3' as in_str from dual
COLUMN common_pattern FORMAT A28
COLUMN IN_STR FORMAT A18
-----------------------------------------------------10G SQL Solution----------------------------------------------------
SELECT DISTINCT id, grp_id, IN_STR, new_str common_pattern
FROM (SELECT id, grp_id,IN_STR, new_str, ct_str, cnt, length(new_str) len,
MAX(LENGTH(new_str)) OVER (PARTITION BY grp_id) max_len
FROM (SELECT id, grp_id, IN_STR, new_str, cnt,
COUNT(DISTINCT id) OVER (PARTITION BY grp_id, new_str) ct_str
FROM(SELECT id, grp_id, IN_STR, rn, cnt,
REPLACE(SYS_CONNECT_BY_PATH (ch,','),',') new_str
FROM (SELECT id,grp_id, IN_STR, ch, cnt,
row_number( ) over (PARTITION BY
grp_id,id ORDER BY rn) rn
FROM (SELECT id, grp_id,IN_STR, rownum rn,
cnt,SUBSTR(IN_STR, LEVEL,1) ch
FROM(SELECT id, grp_id, in_str,
COUNT(id)OVER (PARTITION BY grp_id) cnt
FROM t8)
CONNECT BY PRIOR id = id
AND LEVEL <= LENGTH(IN_STR)
AND PRIOR dbms_random.string('P',10)
IS NOT NULL
)
)
CONNECT BY PRIOR id = id
AND PRIOR rn = rn -1
)
)
WHERE ct_str = cnt
)
WHERE max_len = len
ORDER BY GRP_ID, ID;
ID GRP_ID IN_STR COMMON_PATTERN
---------- ---------- ------------------ ----------------------------
1 10 afpOnef1 One
2 10 1Onez One
3 10 91Onez One
4 20 bptwoj two
5 20 9patwof two
6 20 +patwo3 two
6 rows selected.

July 1st, 2010 at 12:31 pm
The difference is, first I find all common characters, then use them to build common sub strings.
SELECT id,grp_id,in_str,w FROM (SELECT t.*,rank() OVER(PARTITION BY grp_id ORDER BY LENGTH(w) DESC) rnk FROM (SELECT t.*,COUNT(DISTINCT id) OVER(PARTITION BY grp_id,w) cnt_w FROM (SELECT t.*,REPLACE(SYS_CONNECT_BY_PATH(c,’/'),’/') w FROM (SELECT * FROM (SELECT t.*,COUNT(DISTINCT id) OVER(PARTITION BY grp_id,c) cnt_c FROM (SELECT t.*,rn,SUBSTR(in_str,rn,1) c FROM (SELECT t8.*,COUNT(*) OVER (PARTITION BY grp_id) cnt FROM t8) t ,(SELECT ROWNUM rn FROM DUAL CONNECT BY ROWNUM<=100) ) t WHERE c IS NOT NULL ) WHERE cnt=cnt_c ) t CONNECT BY id=PRIOR id AND rn = PRIOR rn +1 ) t ) t WHERE cnt = cnt_w ) WHERE rnk=1;