How to find the longest common substring patterns starting from the string head/tail
March 13th, 2007 By Frank Zhou
The following SQL pattern can be used to find the longest common substring patterns starting from the head/tail of the strings.
create table t5 as
Select 1 as id, 10 as grp_id, 'Fiveafpf1Three' as in_str from dual
union all
select 2 as id, 10 as grp_id, 'Five123zThree' as in_str from dual
union all
select 3 as id, 10 as grp_id, 'Five91OnezThree' as in_str from dual
union all
select 4 as id, 20 as grp_id, 'nine123TEN' as in_str from dual
union all
select 5 as id ,20 as grp_id, 'nine345TEN' as in_str from dual
SQL> select * from t5;
ID GRP_ID IN_STR
---------- ---------- ------------------
1 10 Fiveafpf1Three
2 10 Five123zThree
3 10 Five91OnezThree
4 20 nine123TEN
5 20 nine345TEN
-----------------------------------------------------------------------10G SQL Solution---------------------------------------------------
SELECT id, grp_id, IN_STR,
CASE WHEN INSTR(IN_STR,HEAD,1) =INSTR(IN_STR,TAIL,-1)
THEN HEAD ||'**'
ELSE HEAD ||'**'||TAIL END AS Head_Tail_Pattern
FROM
(SELECT id, grp_id, IN_STR,
MAX(CASE WHEN FLAG = 'HEAD'
THEN common_pattern END) AS HEAD,
MAX(CASE WHEN FLAG = 'TAIL'
THEN common_pattern END) AS TAIL
FROM
(
SELECT id, grp_id, IN_STR, new_str common_pattern , 'HEAD' AS Flag
FROM (SELECT id, grp_id,IN_STR, new_str, ct_str, cnt,
length(new_str) len,
MAX(LENGTH(new_str)) OVER (PARTITION BY grp_id) max_len
FROM (SELECT id, grp_id, IN_STR, new_str,
COUNT(DISTINCT id) OVER (PARTITION BY grp_id, new_str) ct_str,
COUNT(DISTINCT id) OVER (PARTITION BY grp_id) cnt
FROM (SELECT id, grp_id,IN_STR, SUBSTR(IN_STR,1,LEVEL) new_str
FROM t5
CONNECT BY PRIOR id = id
AND LEVEL <= LENGTH(IN_STR)
AND PRIOR dbms_random.string('P',10) IS NOT NULL
)
)
WHERE ct_str = cnt
)
WHERE max_len = len
UNION ALL
SELECT id, grp_id, REVERSE(IN_STR) IN_STR,
REVERSE(new_str) common_pattern, 'TAIL' AS Flag
FROM (SELECT id, grp_id,IN_STR, new_str, ct_str, cnt,
length(new_str) len,
MAX(LENGTH(new_str)) OVER (PARTITION BY grp_id) max_len
FROM (SELECT id, grp_id, IN_STR, new_str,
COUNT(DISTINCT id) OVER (PARTITION BY grp_id, new_str) ct_str,
COUNT(DISTINCT id) OVER (PARTITION BY grp_id) cnt
FROM (SELECT id, grp_id,IN_STR, SUBSTR(IN_STR,1,LEVEL) new_str
FROM (SELECT id, grp_id, REVERSE(IN_STR) IN_STR
FROM t5 )
CONNECT BY PRIOR id = id
AND LEVEL <= LENGTH(IN_STR)
AND PRIOR dbms_random.string('P',10) IS NOT NULL
)
)
WHERE ct_str = cnt
)
WHERE max_len = len
)
GROUP BY GRP_ID, id, IN_STR
)
ORDER BY GRP_ID, id;
ID GRP_ID IN_STR HEAD_TAIL_PATTERN
---------- ---------- ------------------ --------------------------------
1 10 Fiveafpf1Three Five**Three
2 10 Five123zThree Five**Three
3 10 Five91OnezThree Five**Three
4 20 nine123TEN nine**TEN
5 20 nine345TEN nine**TEN

March 21st, 2007 at 4:34 am
note that REVERSE is not a supported function
April 5th, 2007 at 12:08 pm
Well, here is another alternative (maybe not so elegant, but with supported packaged REVERSE function ) – using the table from previous example.
WITH t as (SELECT id, grp_id, in_str, min(in_str) over (partition by grp_id) ref_str, utl_raw.cast_to_raw(in_str) raw_str, utl_raw.cast_to_raw(min(in_str) over (partition by grp_id)) ref_raw_str, utl_raw.reverse(utl_raw.cast_to_raw(in_str)) reverse_raw_str, utl_raw.reverse(utl_raw.cast_to_raw(min(in_str) over (partition by grp_id))) reverse_ref_raw_str FROM t5 ) , t1 as (SELECT id, grp_id, in_str, ref_raw_str, reverse_ref_raw_str, min(nullif(utl_raw.compare(ref_raw_str,raw_str),0)) over(partition by grp_id) -1 head_len, min(nullif(utl_raw.compare(reverse_ref_raw_str,reverse_raw_str),0)) over (partition by grp_id) -1 tail_len FROM t ) , t2 as (SELECT id, grp_id, in_str, utl_raw.substr(ref_raw_str,1,head_len) raw_head, utl_raw.substr(reverse_ref_raw_str,1,tail_len) raw_tail FROM t1 ) SELECT id, grp_id, in_str, utl_raw.cast_to_varchar2(raw_head)||'**'||utl_raw.cast_to_varchar2(utl_raw.reverse(raw_tail)) head_tail_pattern FROM t2; ID GRP_ID IN_STR HEAD_TAIL_PATTERN --- ------- -------------------- -------------------- 1 10 Fiveafpf1Three Five**Three 2 10 Five123zThree Five**Three 3 10 Five91OnezThree Five**Three 4 20 nine123TEN nine**TEN 5 20 nine345TEN nine**TENJune 25th, 2010 at 12:19 pm
WITH t AS ( SELECT t5.*,MIN(LENGTH(in_str)) OVER(PARTITION BY grp_id) len FROM t5 ) SELECT DISTINCT id,grp_id,in_str ,MAX(CASE WHEN head_cnt=1 THEN head END) OVER(PARTITION BY grp_id) ,SUBSTR(MAX(CASE WHEN tail_cnt=1 THEN LPAD(rn,10)||tail END) OVER(PARTITION BY grp_id),11) FROM (SELECT t2.*,COUNT(DISTINCT head) OVER(PARTITION BY grp_id,rn) head_cnt,COUNT(DISTINCT tail) OVER(PARTITION BY grp_id,rn) tail_cnt FROM ( SELECT t.*,rn,SUBSTR(in_str,1,rn) head,SUBSTR(in_str,-rn) tail FROM t,(SELECT ROWNUM rn FROM (SELECT MAX(len) m FROM t) CONNECT BY ROWNUM<=m) WHERE rn<=t.len ) t2 ) WHERE head_cnt=1 OR tail_cnt=1;