OraQA

Oracle Question and Answer

  • Do you have a solution to a problem? Do you have an unanswered question? Login and share it with the Oracle community. More...

Oracle News


Entries RSS feed

Comments RSS feed

How to find the longest common substring patterns starting from the string head/tail

March 13th, 2007 By Frank Zhou

The following SQL pattern can be used to find the longest common substring patterns starting from the head/tail of the strings.

create table t5 as
Select 1 as id, 10 as grp_id,  'Fiveafpf1Three' as in_str from dual
union all
select 2 as id, 10 as grp_id,  'Five123zThree' as in_str from dual
union all
select 3 as id, 10 as grp_id,  'Five91OnezThree' as in_str from dual
union all
select 4 as id, 20 as grp_id,  'nine123TEN' as in_str from dual
union all
select 5 as id ,20 as grp_id,  'nine345TEN' as in_str from dual
SQL> select * from t5;

        ID     GRP_ID IN_STR
---------- ---------- ------------------
         1         10 Fiveafpf1Three
         2         10 Five123zThree
         3         10 Five91OnezThree
         4         20 nine123TEN
         5         20 nine345TEN   

-----------------------------------------------------------------------10G SQL Solution---------------------------------------------------
SELECT id, grp_id, IN_STR,
       CASE WHEN INSTR(IN_STR,HEAD,1) =INSTR(IN_STR,TAIL,-1)
          THEN HEAD ||'**'
   ELSE HEAD ||'**'||TAIL END AS Head_Tail_Pattern
FROM
(SELECT id, grp_id, IN_STR,
        MAX(CASE WHEN FLAG = 'HEAD'
          THEN common_pattern END) AS HEAD,
 MAX(CASE WHEN FLAG = 'TAIL'
          THEN common_pattern END) AS TAIL
 FROM
(
 SELECT id, grp_id, IN_STR, new_str common_pattern , 'HEAD' AS Flag
   FROM (SELECT id, grp_id,IN_STR, new_str, ct_str, cnt,
                length(new_str) len,
                MAX(LENGTH(new_str)) OVER (PARTITION BY grp_id) max_len
           FROM (SELECT id, grp_id, IN_STR, new_str,
                    COUNT(DISTINCT id) OVER (PARTITION BY grp_id, new_str) ct_str,
      COUNT(DISTINCT id) OVER (PARTITION BY grp_id) cnt
                  FROM (SELECT id, grp_id,IN_STR, SUBSTR(IN_STR,1,LEVEL) new_str
                         FROM t5
                        CONNECT BY PRIOR id = id
                        AND LEVEL <= LENGTH(IN_STR)
                        AND PRIOR dbms_random.string('P',10) IS NOT NULL
                       )
                )
          WHERE ct_str = cnt
      )
  WHERE max_len = len
  UNION ALL
  SELECT id, grp_id, REVERSE(IN_STR) IN_STR,
         REVERSE(new_str) common_pattern, 'TAIL' AS Flag
    FROM (SELECT id, grp_id,IN_STR, new_str, ct_str, cnt,
          length(new_str) len,
             MAX(LENGTH(new_str)) OVER (PARTITION BY grp_id) max_len
            FROM (SELECT id, grp_id, IN_STR, new_str,
                   COUNT(DISTINCT id) OVER (PARTITION BY grp_id, new_str) ct_str,
     COUNT(DISTINCT id) OVER (PARTITION BY grp_id) cnt
                   FROM (SELECT id, grp_id,IN_STR, SUBSTR(IN_STR,1,LEVEL) new_str
                           FROM (SELECT id, grp_id, REVERSE(IN_STR) IN_STR
       FROM t5 )
                          CONNECT BY PRIOR id = id
                          AND LEVEL <= LENGTH(IN_STR)
                          AND PRIOR dbms_random.string('P',10) IS NOT NULL
                        )
                 )
           WHERE ct_str = cnt
        )
    WHERE max_len = len
 )
 GROUP BY GRP_ID, id, IN_STR
 )
ORDER BY GRP_ID, id;

        ID     GRP_ID IN_STR             HEAD_TAIL_PATTERN
---------- ---------- ------------------ --------------------------------
         1         10 Fiveafpf1Three     Five**Three
         2         10 Five123zThree      Five**Three
         3         10 Five91OnezThree    Five**Three
         4         20 nine123TEN         nine**TEN
         5         20 nine345TEN         nine**TEN                             

3 Responses to “How to find the longest common substring patterns starting from the string head/tail”

  1. Laurent Schneider Says:

    note that REVERSE is not a supported function

  2. Maxim Demenko Says:

    Well, here is another alternative (maybe not so elegant, but with supported packaged REVERSE function ) – using the table from previous example.

    WITH t as
            (SELECT id,
                    grp_id,
                    in_str,
                    min(in_str) over (partition by grp_id) ref_str,
                    utl_raw.cast_to_raw(in_str) raw_str,
                    utl_raw.cast_to_raw(min(in_str) over (partition by grp_id)) ref_raw_str,
                    utl_raw.reverse(utl_raw.cast_to_raw(in_str)) reverse_raw_str,
                    utl_raw.reverse(utl_raw.cast_to_raw(min(in_str) over (partition by grp_id))) reverse_ref_raw_str
            FROM    t5
            )
            ,
            t1 as
            (SELECT id,
                    grp_id,
                    in_str,
                    ref_raw_str,
                    reverse_ref_raw_str,
                    min(nullif(utl_raw.compare(ref_raw_str,raw_str),0)) over(partition by grp_id)                  -1 head_len,
                    min(nullif(utl_raw.compare(reverse_ref_raw_str,reverse_raw_str),0)) over (partition by grp_id) -1 tail_len
            FROM    t
            )
            ,
            t2 as
            (SELECT id,
                    grp_id,
                    in_str,
                    utl_raw.substr(ref_raw_str,1,head_len) raw_head,
                    utl_raw.substr(reverse_ref_raw_str,1,tail_len) raw_tail
            FROM    t1
            )
    SELECT  id,
            grp_id,
            in_str,
            utl_raw.cast_to_varchar2(raw_head)||'**'||utl_raw.cast_to_varchar2(utl_raw.reverse(raw_tail)) head_tail_pattern
    FROM    t2;
    
     ID  GRP_ID IN_STR               HEAD_TAIL_PATTERN
    --- ------- -------------------- --------------------
      1      10 Fiveafpf1Three       Five**Three
      2      10 Five123zThree        Five**Three
      3      10 Five91OnezThree      Five**Three
      4      20 nine123TEN           nine**TEN
      5      20 nine345TEN           nine**TEN
    
  3. newkid Says:
    WITH t AS (
    SELECT t5.*,MIN(LENGTH(in_str)) OVER(PARTITION BY grp_id) len FROM t5
    )
    SELECT DISTINCT id,grp_id,in_str
          ,MAX(CASE WHEN head_cnt=1 THEN head END) OVER(PARTITION BY grp_id)
          ,SUBSTR(MAX(CASE WHEN tail_cnt=1 THEN LPAD(rn,10)||tail END) OVER(PARTITION BY grp_id),11)
    FROM (SELECT t2.*,COUNT(DISTINCT head) OVER(PARTITION BY grp_id,rn) head_cnt,COUNT(DISTINCT tail) OVER(PARTITION BY grp_id,rn) tail_cnt
            FROM (
          SELECT t.*,rn,SUBSTR(in_str,1,rn) head,SUBSTR(in_str,-rn) tail
            FROM t,(SELECT ROWNUM rn FROM (SELECT MAX(len) m FROM t) CONNECT BY ROWNUM<=m)
           WHERE rn<=t.len
                 ) t2
          )
    WHERE head_cnt=1 OR tail_cnt=1;
    

Leave a Reply

You must be logged in to post a comment.

RSS feed for comments on this question