How to find the Longest Common Subsequence (LCS) in SQL
July 5th, 2010 By Frank Zhou
The following is an interesting problem posted on the programming praxis website:
Longest Common Subsequence
Finding the longest common subsequence of two sequences is a classic computer science problem with an equally
classic solution that dates to the folklore of computing. The longest common subsequence is the longest set of
elements that appear in order (not necessarily contiguous) in two sequences, the solution can be used in matching DNA sequences and it is also the basis of Unix “DIFF” utility.
The following SQL pattern can be used to find the Longest Common Subsequences in different group of strings.
COLUMN Largest_Common_Sequences FORMAT A38
COLUMN in_str FORMAT A18
CREATE TABLE DATA AS
(Select 1 as id, 10 as grp_id, 'ABRACADABRA' as in_str from dual
union all
select 2 as id, 10 as grp_id, 'BARRACUDA' as in_str from dual
union all
select 3 as id, 20 as grp_id, 'HHTHHTHHT' as in_str from dual
union all
select 4 as id, 20 as grp_id, 'THHTHTTHT' as in_str from dual
union all
select 5 as id, 30 as grp_id, 'PROGRAMMING' as in_str from dual
union all
select 6 as id, 30 as grp_id, 'PRAXIS' as in_str from dual
union all
select 7 as id, 40 as grp_id, 'HUMAN' as in_str from dual
union all
select 8 as id, 40 as grp_id, 'CHIMPANZEE' as in_str from dual
union all
select 9 as id, 50 as grp_id, '!T@E#N$o%n^e' as in_str from dual
union all
select 10 as id, 50 as grp_id, 'TE*No()n+e{' as in_str from dual
union all
select 11 as id, 50 as grp_id, '=T8EN7o?ne' as in_str from dual
);
--------------------------------------------------------SQL Solution------------------------------------------------------
WITH MAX_COM_SEQ AS
(SELECT DISTINCT id, grp_id, in_str, seq_str
FROM
(SELECT a.*, length(seq_str) len, MAX(LENGTH(seq_str)) OVER (PARTITION BY grp_id) max_len
FROM
(SELECT a.*,COUNT(DISTINCT id) OVER (PARTITION BY grp_id,seq_str) seq_cnt
FROM
(SELECT a.*, REPLACE(SYS_CONNECT_BY_PATH(chr,','),',') seq_str
FROM (SELECT *
FROM (SELECT a.*,COUNT(DISTINCT id) OVER(PARTITION BY grp_id, chr) cnt_chr
FROM (SELECT a.*, trim(COLUMN_VALUE) chr, rownum as rn
FROM (SELECT data.*, COUNT(*) OVER (PARTITION BY grp_id) cnt FROM data ) a,
xmltable(rtrim(REGEXP_REPLACE(a.in_str,'(.)', '"\1",'), ','))
) a
)
WHERE cnt=cnt_chr
) a
CONNECT BY id=PRIOR id AND rn > PRIOR rn
) a
) a
WHERE cnt = seq_cnt
)
WHERE max_len = len
)
SELECT id, grp_id, in_str, CASE WHEN rank = 1 THEN '[ '|| STRAGG||' ]' END AS Largest_Common_Sequences
FROM
(SELECT id, grp_id, in_str, STRAGG, rank() OVER (PARTITION BY grp_id ORDER BY id ) AS RANK
FROM
(SELECT DISTINCT id, grp_id, in_str, LISTAGG(seq_str, '->') WITHIN GROUP (ORDER BY seq_str) OVER(PARTITION BY id) STRAGG
FROM (SELECT * FROM MAX_COM_SEQ )
)
);
ID GRP_ID IN_STR LARGEST_COMMON_SEQUENCES
---------- ---------- ------------------ --------------------------------------
1 10 ABRACADABRA [ ARACDA->BRACDA ]
2 10 BARRACUDA
3 20 HHTHHTHHT [ HHTHTHT->THHTHHT ]
4 20 THHTHTTHT
5 30 PROGRAMMING [ PRAI ]
6 30 PRAXIS
7 40 HUMAN [ HMAN ]
8 40 CHIMPANZEE
9 50 !T@E#N$o%n^e [ TENone ]
10 50 TE*No()n+e{
11 50 =T8EN7o?ne
11 rows selected.
SQL>

July 6th, 2010 at 6:28 am
11GR2:
WITH d AS ( SELECT dt.*,COUNT(DISTINCT id) OVER(PARTITION BY grp_id,c) cnt_c FROM (SELECT dt.*,SUBSTR(in_str,rn,1) c,rn FROM (SELECT data.*,COUNT(*) OVER (PARTITION BY grp_id) cnt FROM data) dt ,(SELECT ROWNUM rn FROM (SELECT MAX(LENGTH(in_str)) len FROM data) CONNECT BY ROWNUM<=len) WHERE rn<=LENGTH(in_str) ) dt ) ,t(id,grp_id,in_str,rn,w,cnt) AS ( SELECT id,grp_id,in_str,rn,c,cnt FROM d UNION ALL SELECT d.id,d.grp_id,d.in_str,d.rn,t.w||d.c,t.cnt FROM t,d WHERE t.id = d.id AND t.rn<d.rn ) SELECT grp_id,w FROM (SELECT grp_id,w,rank() OVER(PARTITION BY grp_id ORDER BY LENGTH(w) DESC) rnk FROM t GROUP BY grp_id,w HAVING COUNT(DISTINCT id) = MAX(cnt) ) WHERE rnk=1 ; GRP_ID W ---------- ------------ 10 ARACDA 10 BRACDA 20 HHTHTHT 20 THHTHHT 30 PRAI 40 HMAN 50 TENone