Repeated DNA sequences

2 min readFeb 2, 2022

Below is the problem description:

The DNA sequence is composed of a series of nucleotides abbreviated as 'A', 'C', 'G', and 'T'.

For example, "ACGAATTCCG" is a DNA sequence.

When studying DNA, it is useful to identify repeated sequences within the DNA.

Given a string s that represents a DNA sequence, return all the 10-letter-long sequences (substrings) that occur more than once in a DNA molecule. You may return the answer in any order.

Brute force solution to this problem:

struct hnode{
char key[10];
int val;
UT_hash_handle hh;
};
/**
* Note: The returned array must be malloced, assume caller calls free().
*/
char ** findRepeatedDnaSequences(char * s, int* returnSize){
int ii, sl,jj,reslen=0;
int *key;
struct hnode *node, *temp;
struct hnode *ht = NULL;
char **res;

sl = strlen(s);
*returnSize = 0;
if (sl <=10)
return NULL;
res = (char **)malloc(sl*sizeof(char *));
node = (struct hnode *) malloc(sizeof(struct hnode));
for (ii=0; ii<10; ii++)
node->key[ii] = s[ii];
node->val = 1;
HASH_ADD(hh, ht, key, 10, node);
for (ii=1; ii<=(sl-10);ii++){
HASH_FIND(hh, ht, s+ii, 10, node);
if (!node){
node = (struct hnode *) malloc(sizeof(struct hnode));
for (jj=0; jj<10;jj++){
node->key[jj] = s[ii+jj];
}
node->val= 1;
HASH_ADD(hh,ht, key, 10,node);
}
else if (node->val==1) {
res[reslen] = (char *)calloc(11,sizeof(char));
for (jj=0;jj<10;jj++)
res[reslen][jj] = node->key[jj];
reslen++;
node->val++;
}
}
*returnSize = reslen;
return res;
}

This is a tough problem.
Challenge is to avoid string comparisons and to avoid storing strings.
I learnt from solutions approach and other solutions to make use of indexed array.
Then used array of bits to for indexed array.
This reduced memory.
As there is no hash table, time improved.
Below is my solution:

/* fastest solution is to avoid mallocs/callocs
We can avoid these when not using hash table , so to use indexed array.
If we are representing bits for letters.
and in this case 0,1,2,3 and 10 letters. 
Max value is : in binary 1111 1111 1111 1111 1111 (0xfffff)
indexed array size should be 0xfffff
If we can use bits then array size is 0x1ffff
*/
char ** findRepeatedDnaSequences(char * s, int* returnSize){
    int ht[0x20000], node_2_res[0x20000];
    int key=0, ii, jj,byte_pos,bit_pos;
    int keymask = 0xfffff;
    int chval[]={1,5,2,5,5,5,3,5,5,5,5,5,5,5,5,5,5,5,5,0};
    char **res;
    int reslen = 0, sl;    memset(ht, 0, sizeof(ht));
    memset(node_2_res, 0, sizeof(node_2_res));
    
    sl =  strlen(s);
    for (ii=0; ii<10 && s[ii] !='\0'; ii++){
        key = (key<<2)| chval[s[ii]-'A'];
     //   printf("s[%d] %c, key %x\n",ii, s[ii], key);
    }    
    *returnSize = 0;
    if (sl<=10)
        return NULL;
    
    int res_ind[sl/2];
    
    byte_pos = key/8;
    bit_pos = key %8;
    ht[byte_pos] |= (1<<bit_pos);
       // printf("s[%d] %c, key %x, byte_pos %x, bit_pos %x\n",ii, s[ii], key, byte_pos, bit_pos);
    for (ii=10;ii<sl;ii++){
        key = ((key<<2)&keymask) | chval[s[ii]-'A'];
        byte_pos = key/8;
        bit_pos = key % 8;
       // printf("reslen %d s[%d] %c, key %x, byte_pos %x, bit_pos %x\n",reslen, ii, s[ii], key, byte_pos, bit_pos);
        /* if exists in index table */
        if (ht[byte_pos] & (1<<bit_pos)){
            
            /* node already not added in res */
            if (!(node_2_res[byte_pos] & (1<<bit_pos))){
                res_ind[reslen++]=ii-9;
                node_2_res[byte_pos] |= (1<<bit_pos);
            } 
        }else
            ht[byte_pos] |= (1<<bit_pos);
    }
    res = (char **)malloc(reslen*sizeof(char*));
    *returnSize = reslen;
    for (ii=0;ii<reslen;ii++){
        res[ii] = (char *)calloc(11,sizeof(char));
        for (jj=0;jj<10;jj++)
            res[ii][jj] = s[res_ind[ii]+jj];
    }
    return res;
}

Repeated DNA sequences

Written by Jyothi

No responses yet