How to find the longest common substring pattern in different group of strings in SQL
August 14th, 2007 By Frank Zhou
The following SQL pattern can be used to find the longest common substring pattern in different group of strings.
create table t8 as
Select 1 as id, 10 as grp_id, 'afpOnef1' as in_str from dual
union all
select 2 as id, 10 as grp_id, '1Onez' as in_str from dual
union all
select 3 as id, 10 as grp_id, '91Onez' as in_str from dual
union all
select 4 as id, 20 as grp_id, 'bptwoj' as in_str from dual
union all
select 5 as id ,20 as grp_id, '9patwof' as in_str from dual
union all
select 6 as id ,20 as grp_id, '+patwo3' as in_str from dual
COLUMN common_pattern FORMAT A28
COLUMN IN_STR FORMAT A18
SELECT DISTINCT id, grp_id, IN_STR, new_str common_pattern
FROM (SELECT id, grp_id,IN_STR, new_str, ct_str, cnt, length(new_str) len,
MAX(LENGTH(new_str)) OVER (PARTITION BY grp_id) max_len
FROM (SELECT id, grp_id, IN_STR, new_str, cnt,
COUNT(DISTINCT id) OVER (PARTITION BY grp_id, new_str) ct_str
FROM(SELECT id, grp_id, IN_STR, rn, cnt,
REPLACE(SYS_CONNECT_BY_PATH (ch,','),',') new_str
FROM (SELECT id,grp_id, IN_STR, ch, cnt,
row_number( ) over (PARTITION BY
grp_id,id ORDER BY rn) rn
FROM (SELECT id, grp_id,IN_STR, rownum rn,
cnt,SUBSTR(IN_STR, LEVEL,1) ch
FROM(SELECT id, grp_id, in_str,
COUNT(id)OVER (PARTITION BY grp_id) cnt
FROM t8)
CONNECT BY PRIOR id = id
AND LEVEL <= LENGTH(IN_STR)
AND PRIOR dbms_random.string('P',10)
IS NOT NULL
)
)
CONNECT BY PRIOR id = id
AND PRIOR rn = rn -1
)
)
WHERE ct_str = cnt
)
WHERE max_len = len
ORDER BY GRP_ID, ID;
ID GRP_ID IN_STR COMMON_PATTERN
---------- ---------- ------------------ ----------------------------
1 10 afpOnef1 One
2 10 1Onez One
3 10 91Onez One
4 20 bptwoj two
5 20 9patwof two
6 20 +patwo3 two
6 rows selected.
