Sequence alignment (25) editing distance

This paper introduces the editing distance of two strings and gives the code.

Edit distance

Editing distance is the minimum step required to convert one string into another given two strings. The changes include inserting a character, deleting a character, and replacing a character. For example: v = TGCATAT BM {v} = TGCATATv = TGCATAT and w = ATCCGAT BM {w} = ATCCGATw = ATCCGAT two strings edit distance of 4.

The process of solving editing distance is very similar to global alignment (for global alignment, see the previous "Needleman-Wunsch algorithm for global alignment (1)"). All symbols are required to participate in the alignment, and insertion, deletion and mismatch are allowed. Therefore, the editing distance can be solved by dynamic programming algorithm, and its iteration formula is as follows:

F(i,j) is the minimum score of alignments between x1...i and y1...j.F(i,0)=ifor i=0...m.F(0,j)=jfor j=1...n.s(i,j)={0if xi=yj,1otherwise.F(i,j)=min⁡{F(i−1,j)+1,F(i,j−1)+1,F(i−1,j−1)+s(i,j)\begin{aligned} & \text{$F(i,j)$ is the minimum score of alignments between $x_{1 \ldots i}$ and $y_{1 \ldots j}$.} \\ & F(i, 0) = i \quad \text{for $i = 0 \ldots m$.} \\ & F(0, j) = j \quad \text{for $j = 1 \ldots n$.} \\ & s(i,j) = \begin{cases} 0 & \text{if $x_i = y_j$,} \\ 1 & \text{otherwise.} \end{cases} \\ & F(i, j) = \min \begin{cases} F(i - 1, j) + 1, \\ F(i, j - 1) + 1, \\ F(i - 1, j - 1) + s(i, j) \end{cases} \end{aligned}​F(i,j) is the minimum score of alignments between x1...i​ and y1...j​.F(i,0)=ifor i=0...m.F(0,j)=jfor j=1...n.s(i,j)={01​if xi​=yj​,otherwise.​F(i,j)=min⎩⎪⎨⎪⎧​F(i−1,j)+1,F(i,j−1)+1,F(i−1,j−1)+s(i,j)​​

The results are as follows:

(Public No. Shengxin)

Editing Distance and Longest Common Subsequence

When only insertion and deletion are allowed and mismatch is not allowed, the editing distance of two strings can be calculated indirectly by the length of the longest common subsequence (for the longest common subsequence, you can refer to the previous article "Sequence alignment (24) longest common subsequence").

In the case that only insertion and deletion are allowed and mismatch is not allowed, it is assumed that given two strings v BM {v} V and w BM {w} w, the lengths are mmm and nnn, respectively. If the length of the longest common subsequence is recorded as LC (v, w) LC ( BM {v}, BM {w}) LC (v, w) and the editing distance is recorded as DE (v, w) DE ( BM {v}, \ BM {w}) DE (v, w), then:
DE(v,w)=m+n−2×LC(v,w) DE(\bm{v}, \bm{w}) = m + n - 2 \times LC(\bm{v}, \bm{w}) DE(v,w)=m+n−2×LC(v,w)

Code for solving edit distance

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define MAXSEQ 1000
#define GAP_CHAR '-'
#define min(a, b) ((a) < (b) ? (a) : (b))
#define diff(a, b) ((a) == (b) ? 0 : 1)

struct Unit {
    int W1;   // Whether to go back to the top
    int W2;   // Whether to go back to the left or not
    int W3;   // Whether to go back to the left
    int M;      // The score of the unit (i, j) of the score matrix, i. e. the lowest score of the comparison between sequence s(1,...,i) and sequence r(1,...,j)
};
typedef struct Unit *pUnit;

void strUpper(char *s);
void printAlign(pUnit** a, const int i, const int j, char* s, char* r, char* saln, char* raln, int n);
void align(char *s, char *r);

int main() {
    char s[MAXSEQ];
    char r[MAXSEQ];
    printf("The 1st seq: ");
    scanf("%s", s);
    printf("The 2nd seq: ");
    scanf("%s", r);
    align(s, r);
    return 0;
}

void strUpper(char *s) {
    while (*s != '\0') {
        if (*s >= 'a' && *s <= 'z') {
            *s -= 32;
        }
        s++;
    }
}

void printAlign(pUnit** a, const int i, const int j, char* s, char* r, char* saln, char* raln, int n) {
    int k;
    pUnit p = a[i][j];
    if (! i && ! j) {   // Arrival (0, 0)
        for (k = n - 1; k >= 0; k--)
            printf("%c", saln[k]);
        printf("\n");
        for (k = n - 1; k >= 0; k--)
            printf("%c", raln[k]);
        printf("\n\n");
        return;
    }
    if (p->W1) {    // Backtracking up
        saln[n] = s[i - 1];
        raln[n] = GAP_CHAR;
        printAlign(a, i - 1, j, s, r, saln, raln, n + 1);
    }
    if (p->W2) {    // Backtracking to the left
        saln[n] = s[i - 1];
        raln[n] = r[j - 1];
        printAlign(a, i - 1, j - 1, s, r, saln, raln, n + 1);
    }
    if (p->W3) {     // Backtracking to the left
        saln[n] = GAP_CHAR;
        raln[n] = r[j - 1];
        printAlign(a, i, j - 1, s, r, saln, raln, n + 1);
    }
}

void align(char *s, char *r) {
    int i, j;
    int m = strlen(s);
    int n = strlen(r);
    int m1, m2, m3, minm;
    pUnit **aUnit;
    char* salign;
    char* ralign;
    // Initialization
    if ((aUnit = (pUnit **) malloc(sizeof(pUnit*) * (m + 1))) == NULL) {
        fputs("Error: Out of space!\n", stderr);
        exit(1);
    }
    for (i = 0; i <= m; i++) {
        if ((aUnit[i] = (pUnit *) malloc(sizeof(pUnit) * (n + 1))) == NULL) {
            fputs("Error: Out of space!\n", stderr);
            exit(1);     
        }
        for (j = 0; j <= n; j++) {
            if ((aUnit[i][j] = (pUnit) malloc(sizeof(struct Unit))) == NULL) {
                fputs("Error: Out of space!\n", stderr);
                exit(1);     
            }
            aUnit[i][j]->W1 = 0;
            aUnit[i][j]->W2 = 0;
            aUnit[i][j]->W3 = 0;
        }
    }
    for (i = 0; i <= m; i++) {
        aUnit[i][0]->M = i;
        aUnit[i][0]->W1 = 1;
    }
    for (j = 1; j <= n; j++) {
        aUnit[0][j]->M = j;
        aUnit[0][j]->W3 = 1;
    }
    // Turn strings into capitals
    strUpper(s);
    strUpper(r);
    // Dynamic programming algorithm calculates the score of each unit of score matrix
    for (i = 1; i <= m; i++) {
        for (j = 1; j <= n; j++) {
            m1 = aUnit[i - 1][j]->M + 1;
            m2 = aUnit[i - 1][j - 1]->M + diff(s[i - 1], r[j - 1]);
            m3 = aUnit[i][j - 1]->M + 1;
            minm = min(min(m1, m2), m3);
            aUnit[i][j]->M = minm;
            if (m1 == minm) aUnit[i][j]->W1 = 1;
            if (m2 == minm) aUnit[i][j]->W2 = 1;
            if (m3 == minm) aUnit[i][j]->W3 = 1;
        }
    }
/*
    // Print Score Matrix
    for (i = 0; i <= m; i++) {
        for (j = 0; j <= n; j++)
            printf("%d ", aUnit[i][j]->M);
        printf("\n");
    }
*/
    printf("min score: %d\n", aUnit[m][n]->M);
    // Print the best comparison results, if there are more than one, print them all.
    // Recursive method
    if (aUnit[m][n]->M == 0) {
        fputs("Two seqs are totally the same.\n", stdout);
    } else {
        if ((salign = (char*) malloc(sizeof(char) * (m + n))) == NULL || \
            (ralign = (char*) malloc(sizeof(char) * (m + n))) == NULL) {
            fputs("Error: Out of space!\n", stderr);
            exit(1);
        }
        printAlign(aUnit, m, n, s, r, salign, ralign, 0);
        // Free memory
        free(salign);
        free(ralign);
    }
    for (i = 0; i <= m; i++) {
        for (j = 0; j <= n; j++)
            free(aUnit[i][j]);
        free(aUnit[i]);
    }
    free(aUnit);
}

Tags: Programming

Posted on Wed, 11 Sep 2019 00:28:21 -0700 by kalpesh