OraQA

Oracle Question and Answer

  • Do you have a solution to a problem? Do you have an unanswered question? Login and share it with the Oracle community. More...

Oracle News


Entries RSS feed

Comments RSS feed

How to find the Longest Common Subsequence (LCS) in SQL

July 5th, 2010 By Frank Zhou

The following is an interesting problem posted on the programming praxis website:

Longest Common Subsequence
Finding the longest common subsequence of two sequences is a classic computer science problem with an equally
classic solution that dates to the folklore of computing. The longest common subsequence is the longest set of
elements that appear in order (not necessarily contiguous) in two sequences, the solution can be used in matching DNA sequences and it is also the basis of Unix “DIFF” utility.

The following SQL pattern can be used to find the Longest Common Subsequences in different group of strings.

COLUMN Largest_Common_Sequences FORMAT  A38
COLUMN in_str FORMAT  A18

CREATE TABLE  DATA AS
(Select 1 as id, 10 as grp_id, 'ABRACADABRA'       as in_str from dual
union all
select 2 as id, 10 as grp_id, 'BARRACUDA'          as in_str from dual
union all
select 3 as id, 20 as grp_id, 'HHTHHTHHT'          as in_str from dual
union all
select 4 as id, 20 as grp_id, 'THHTHTTHT'          as in_str from dual
union all
select 5 as id, 30 as grp_id, 'PROGRAMMING'        as in_str from dual
union all
select 6 as id, 30 as grp_id, 'PRAXIS'             as in_str from dual
union all
select 7 as id, 40 as grp_id, 'HUMAN'              as in_str from dual
union all
select 8 as id, 40 as grp_id, 'CHIMPANZEE'         as in_str from dual
union all
select 9 as id, 50 as grp_id,  '!T@E#N$o%n^e'      as in_str from dual
union all
select 10 as id, 50 as grp_id, 'TE*No()n+e{'       as in_str from dual
union all
select 11 as id, 50 as grp_id, '=T8EN7o?ne'        as in_str from dual
);

--------------------------------------------------------SQL Solution------------------------------------------------------



WITH MAX_COM_SEQ  AS
(SELECT DISTINCT id, grp_id, in_str, seq_str
  FROM 
    (SELECT a.*, length(seq_str) len, MAX(LENGTH(seq_str)) OVER (PARTITION BY grp_id) max_len
      FROM
      (SELECT a.*,COUNT(DISTINCT id) OVER (PARTITION BY grp_id,seq_str) seq_cnt 
       FROM
       (SELECT a.*, REPLACE(SYS_CONNECT_BY_PATH(chr,','),',') seq_str
         FROM (SELECT * 
                FROM (SELECT a.*,COUNT(DISTINCT id) OVER(PARTITION BY grp_id, chr) cnt_chr
                       FROM (SELECT a.*, trim(COLUMN_VALUE) chr, rownum as rn
                              FROM (SELECT data.*, COUNT(*) OVER (PARTITION BY grp_id) cnt FROM data ) a,
                                    xmltable(rtrim(REGEXP_REPLACE(a.in_str,'(.)', '"\1",'), ',')) 
                             ) a
                     )
                WHERE cnt=cnt_chr
               ) a
         CONNECT BY id=PRIOR id AND rn > PRIOR rn 
       ) a
      ) a
     WHERE cnt = seq_cnt 
  )   
  WHERE max_len = len
)
SELECT id, grp_id, in_str, CASE WHEN rank = 1 THEN '[ '|| STRAGG||' ]' END AS Largest_Common_Sequences
 FROM 
 (SELECT id, grp_id, in_str, STRAGG, rank() OVER (PARTITION BY grp_id ORDER BY id ) AS RANK
  FROM
  (SELECT DISTINCT id, grp_id, in_str, LISTAGG(seq_str, '->') WITHIN GROUP (ORDER BY seq_str) OVER(PARTITION BY id) STRAGG  
   FROM (SELECT * FROM MAX_COM_SEQ )
  )
 );




        ID     GRP_ID IN_STR             LARGEST_COMMON_SEQUENCES
---------- ---------- ------------------ --------------------------------------
         1         10 ABRACADABRA        [ ARACDA->BRACDA ]
         2         10 BARRACUDA
         3         20 HHTHHTHHT          [ HHTHTHT->THHTHHT ]
         4         20 THHTHTTHT
         5         30 PROGRAMMING        [ PRAI ]
         6         30 PRAXIS
         7         40 HUMAN              [ HMAN ]
         8         40 CHIMPANZEE
         9         50 !T@E#N$o%n^e       [ TENone ]
        10         50 TE*No()n+e{
        11         50 =T8EN7o?ne

11 rows selected.
SQL>



One Response to “How to find the Longest Common Subsequence (LCS) in SQL”

  1. newkid Says:

    11GR2:

    WITH d AS (
        SELECT dt.*,COUNT(DISTINCT id) OVER(PARTITION BY grp_id,c) cnt_c
          FROM (SELECT dt.*,SUBSTR(in_str,rn,1) c,rn
                  FROM (SELECT data.*,COUNT(*) OVER (PARTITION BY grp_id) cnt FROM data) dt
                      ,(SELECT ROWNUM rn FROM (SELECT MAX(LENGTH(in_str)) len FROM data) CONNECT BY ROWNUM<=len)
                 WHERE rn<=LENGTH(in_str)
                ) dt
    )
    ,t(id,grp_id,in_str,rn,w,cnt) AS (
        SELECT id,grp_id,in_str,rn,c,cnt FROM d
        UNION ALL
        SELECT d.id,d.grp_id,d.in_str,d.rn,t.w||d.c,t.cnt
          FROM t,d
         WHERE t.id = d.id AND t.rn<d.rn
    )
    SELECT grp_id,w
      FROM (SELECT grp_id,w,rank() OVER(PARTITION BY grp_id ORDER BY LENGTH(w) DESC) rnk
             FROM t
             GROUP BY grp_id,w
             HAVING COUNT(DISTINCT id) = MAX(cnt)
            )
    WHERE rnk=1
    ;
    
        GRP_ID W
    ---------- ------------
            10 ARACDA
            10 BRACDA
            20 HHTHTHT
            20 THHTHHT
            30 PRAI
            40 HMAN
            50 TENone
    

Leave a Reply

You must be logged in to post a comment.

RSS feed for comments on this question