/* A program for global alignment with a context-dependent scoring scheme Copyright (c) 1995 Xiaoqiu Huang All Rights Reserved. Permission to use, copy, modify, and distribute this software and its documentation for educational, research and non-profit purposes, without fee, and without a written agreement is hereby granted, provided that the above copyright notice, this paragraph and the following three paragraphs appear in all copies. Permission to incorporate this software into commercial products may be obtained from the Intellectual Property Office, 1400 Townsend Drive, 301 Administration Building, Houghton, MI 49931, phone (906) 487-3429. IN NO EVENT SHALL THE AUTHOR OR MICHIGAN TECHNOLOGICAL UNIVERSITY BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF MICHIGAN TECHNOLOGICAL UNIVERSITY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. MICHIGAN TECHNOLOGICAL UNIVERSITY SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND MICHIGAN TECHNOLOGICAL UNIVERSITY HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. Proper attribution of the author as the source of the software would be appreciated: A Context Dependent Method for Comparing Sequences, Proceedings of the 5th Symposium on Combinatorial Pattern Matching, Lecture Notes in Computer Science 807, Springer-Verlag, 54-63. Xiaoqiu Huang Department of Computer Science Michigan Technological University Houghton, MI 49931 E-mail: huang@cs.mtu.edu The Context-Dependent Alignment (CDA) program computes a global alignment of two sequences. It delivers the alignment in linear space, so long sequences can be aligned. Users supply scoring parameters. In the simplest form, users just provide 5 integers: ms, q, r, l and b, where ms is the score of a mismatch, q is gap-open penalty, r is gap-extension penalty, l is context length, and b is match bonus. Each match automatically receives a score of 10. The score of an i-symbol indel is -(q + r * i). A match in a substitution block receives a single left bonus of b if at least one other match occurs within l positions to the left of the match. Similarly, a match in a substitution block receives a single right bonus of b if at least one other match occurs within l positions to the right of the match. This simple scoring scheme may be used for DNA sequences. NOTE: all scores are integers. In general, users can define an alphabet of characters appearing in the sequences and a matrix that gives the context-independent substitution score for each pair of symbols in the alphabet. The 127 ASCII characters are eligible. The alphabet and matrix are given in a file, where the first line lists the characters in the alphabet and the lower triangle of the matrix comes next. An example file looks as follows: ARNDC 13 -15 19 -10 -22 11 -20 -10 -20 18 -10 -20 -10 -20 12 Here the -22 at position (3,2) is the score of replacing N by R. This general scoring scheme is useful for protein sequences where the set of protein characters and Dayhoff matrix are specified in the file. The CDA program is written in C and runs under Unix systems on Sun workstations and under DOS systems on PCs. We think that the program is portable to many machines. Sequences to be analyzed are stored in separate files. An input file contains all characters of a sequence, separated by newline characters, in linear order. No other characters are allowed. Since upper case and lower case characters are different, use the same case consistently. A sample sequence file of 4 lines is shown below. GAATTCTAATCTCCCTCTCAACCCTACAGTCACCCATTTGGTATATTAAA GATGTGTTGTCTACTGTCTAGTATCCCTCAAGTAGTGTCAGGAATTAGTC ATTTAAATAGTCTGCAAGCCAGGAGTGGTGGCTCATGTCTGTAATTCCAG CACTGGAGAGGTAGAAGTG To find the best alignment of two sequences in files A and B, use a command of form cda A B ms q r l b > result where cda is the name of the object code, ms is a negative integer specifying mismatch weight, q and r are non-negative integers specifying gap-open and gap-extend penalties, respectively, l is a positive integer giving context length, and b is a non-negative integer giving match bonus. Output alignment is saved in the file "result". For using a scoring matrix defined in file S, use a command of form cda A B S q r l b > result Note that ms is replaced by the file S. Acknowledgments The functions diff() and display() are from Gene Myers. We made the following modifications: similarity weights (integer), instead of distance weights (float), are used. A context-dependent scoring scheme is used for close matches. */ #include static int match, mismh; /* max and min substitution weights */ static char *name1, *name2; /* names of sequence files */ static int v[128][128]; /* substitution scores */ static int w[128][128]; /* context-denpendent substitution scores */ static int MatchDis; /* maximum distance between two adjacent matches */ static int q, r; /* gap penalties */ static int qr; /* qr = q + r */ static int *CC, *DD; /* saving matrix scores */ static int *RR, *SS, *EE, *FF; /* saving start-points */ static int *HH, *WW; /* saving matrix scores */ static int *II, *JJ, *XX, *YY; /* saving start-points */ static int *GG, *QQ; /* context-dependent matrix scores */ static int *FL, *BL; /* distance of the last match */ static int *S; static int *sapp; /* Current script append ptr */ static int last; /* Last script op appended */ static int no_mat; /* number of matches */ static int no_mis; /* number of mismatches */ static int al_len; /* length of alignment */ main(argc, argv) int argc; char *argv[]; { int M, N; /* Sequence lengths and k */ char *A, *B; /* Storing two sequences */ int symbol; /* The next character */ int score; int ms; /* User-supplied weights */ FILE *Bp, *Ap, *Sp, *ckopen(); char *ckalloc(); /* space-allocating function */ register int i, j; char alph[129], *s; /* alphabet */ int size, t; /* size of alphabet */ int bonus; /* match bonus */ if ( argc != 8 ) fatalf("The proper form of command is: \n%s file1 file2 mismatch(<0) gap-open(>=0) gap-extend(>0) context-length(>0) match-bonus(>=0)", argv[0]); /* determine the sequence lengths */ Ap = ckopen(argv[1], "r"); for (M = 0; ( symbol = getc(Ap)) != EOF ; ) if ( symbol != '\n' ) ++M; (void) fclose(Ap); name1 = argv[1]; /* allocate space for A */ A = ( char * ) ckalloc( (M + 1) * sizeof(char)); /* read the first sequence into A */ Ap = ckopen(argv[1], "r"); for (M = 0; ( symbol = getc(Ap)) != EOF ; ) if ( symbol != '\n' ) A[++M] = symbol; /* if there is another sequence, read it into B */ if ( argc == 8 ) { Bp = ckopen(argv[2], "r"); for (N = 0; ( symbol = getc(Bp)) != EOF ; ) if ( symbol != '\n' ) ++N; (void) fclose(Bp); name2 = argv[2]; B = ( char * ) ckalloc( (N + 1) * sizeof(char)); Bp = ckopen(argv[2], "r"); for (N = 0; ( symbol = getc(Bp)) != EOF ; ) if ( symbol != '\n' ) B[++N] = symbol; } (void) sscanf(argv[argc-4],"%d", &q); if ( q < 0 ) fatal("The gap-open penalty is a nonnegative integer"); (void) sscanf(argv[argc-3],"%d", &r); if ( r <= 0 ) fatal("The gap-extend penalty is a positive integer"); qr = q + r; /* check if the argument represents a negative integer */ s = argv[argc-5]; if ( *s == '-' ) s++; for ( ; *s >= '0' && *s <= '9' ; s++ ); if ( *s == '\0' ) { (void) sscanf(argv[argc-5],"%d", &ms); if ( ms >= 0 ) fatal("The mismatch weight is a negative integer"); match = 10; mismh = ms; /* set match and mismatch weights */ for ( i = 0; i < 128 ; i++ ) for ( j = 0; j < 128 ; j++ ) if (i == j ) v[i][j] = 10; else v[i][j] = mismh; } else { /* read a file containing alphabet and substitution weights */ Sp = ckopen(argv[argc-5], "r"); (void) fscanf(Sp, "%s", alph); size = strlen(alph); match = mismh = 0; for ( i = 0; i < size ; i++ ) for ( j = 0; j <= i ; j++ ) { (void) fscanf(Sp, "%d", &ms); v[alph[i]][alph[j]] = v[alph[j]][alph[i]] = ms; if ( ms > match ) match = ms; if ( ms < mismh ) mismh = ms; } } (void) sscanf(argv[argc-2],"%d", &MatchDis); if ( MatchDis <= 0 ) fatal("The context length is a positive integer"); (void) sscanf(argv[argc-1],"%d", &bonus); if ( bonus < 0 ) fatal("The match bonus a nonnegative integer"); for ( i = 0; i < 128 ; i++ ) for ( j = 0; j < 128 ; j++ ) if (i == j ) w[i][j] = bonus; else w[i][j] = 0; /* allocate space for all vectors */ j = (N + 1) * sizeof(int); CC = ( int * ) ckalloc(j); DD = ( int * ) ckalloc(j); RR = ( int * ) ckalloc(j); SS = ( int * ) ckalloc(j); EE = ( int * ) ckalloc(j); FF = ( int * ) ckalloc(j); GG = ( int * ) ckalloc(j); QQ = ( int * ) ckalloc(j); FL = ( int * ) ckalloc(j); BL = ( int * ) ckalloc(j); i = (M + 1) * sizeof(int); HH = ( int * ) ckalloc(i); WW = ( int * ) ckalloc(i); II = ( int * ) ckalloc(i); JJ = ( int * ) ckalloc(i); XX = ( int * ) ckalloc(i); YY = ( int * ) ckalloc(i); S = ( int * ) ckalloc(i + j); printf("Match Mismatch Gap-Open Penalty Gap-Extension Penalty Context-Length Bonus\n"); printf(" %d %d %d %d %d %d\n\n", match, mismh, q, r, MatchDis, bonus); { printf(" Upper Sequence : %s\n", name1); printf(" Length : %d\n", M); printf(" Lower Sequence : %s\n", name2); printf(" Length : %d\n", N); } sapp = S; last = 0; al_len = 0; no_mat = 0; no_mis = 0; score = diff(A, B,M,N,q,q,0,0); /* Output the best alignment */ printf(" Similarity Score : %d\n",score); printf(" Match Percentage : %d%%\n", (100*no_mat)/al_len); printf(" Number of Matches : %d\n", no_mat); printf(" Number of Mismatches : %d\n", no_mis); printf(" Total Length of Gaps : %d\n", al_len-no_mat-no_mis); display(A,B,M,N,S,1,1); fflush(stdout); } int diff(), display(); #define gap(k) ((k) <= 0 ? 0 : q+r*(k)) /* k-symbol indel score */ /* Append "Delete k" op */ #define DEL(k) \ { al_len += k; \ if (last < 0) \ last = sapp[-1] -= (k); \ else \ last = *sapp++ = -(k); \ } /* Append "Insert k" op */ #define INS(k) \ { al_len += k; \ if (last < 0) \ { sapp[-1] = (k); *sapp++ = last; } \ else \ last = *sapp++ = (k); \ } /* Append "Replace" op */ #define REP \ { last = *sapp++ = 0; \ al_len += 1; \ } /* diff(A,B,M,N,tb,te,db,de) returns the score of an optimum conversion between A[1..M] and B[1..N] and appends such a conversion to the current script. The conversion begins (ends) with a delete if tb (te) is zero, or it begins (ends) with a replacement if db (de) is positive. */ int diff(A,B,M,N,tb,te,db,de) char *A, *B; int M, N; int tb, te, db, de; { int midi, midj, type; /* Midpoint, type, and cost */ int midc; register int i, j; register int c, e, d, s; int f, g, cl, sl; int t, *va, *wa; int ai, bj; char *ckalloc(); /* Boundary cases: M <= 1 or N == 0 */ if (N <= 0) { if (M > 0) DEL(M) return - gap(M); } if (M <= 1) { if (M <= 0) { INS(N); return - gap(N); } if (tb > te) tb = te; midc = - (tb + r + gap(N) ); midj = 0; va = v[A[1]]; for (j = 1; j <= N; j++) { c = va[B[j]] - ( gap(j-1) + gap(N-j) ); if ( j == 1 ) c += db; if ( j == N ) c += de; if (c > midc) { midc = c; midj = j; } } if (midj == 0) { INS(N) DEL(1) } else { if (midj > 1) INS(midj-1) REP if ( A[1] == B[midj] ) no_mat += 1; else no_mis += 1; if (midj < N) INS(N-midj) } return midc; } /* Divide: Find optimum midpoint (midi,midj) of cost midc */ midi = M/2; /* Forward phase: */ CC[0] = 0; /* Compute C(M/2,k) & D(M/2,k) for all k */ t = -q; for (j = 1; j <= N; j++) { CC[j] = GG[j] = t = t-r; DD[j] = t-q; FL[j] = MatchDis + 1; } t = -tb; for (i = 1; i <= midi; i++) { s = CC[0]; f = i == 1 ? db : s; CC[0] = c = t = t-r; sl = MatchDis + 1; e = t-q; va = v[(ai = A[i])]; wa = w[ai]; for (j = 1; j <= N; j++) { if ((c = c - qr) > (e = e - r)) e = c; if ((c = CC[j] - qr) > (d = DD[j] - r)) d = c; g = va[(bj = B[j])]; c = s+g; g += f; if ( ai != bj ) cl = sl + 1; else { if ( sl <= MatchDis ) g += wa[bj] + w[A[i-sl]][B[j-sl]]; if ( g < c ) g = c; cl = 1; } if (c < d) c = d; if (c < e) c = e; if (c < g) c = g; s = CC[j]; CC[j] = c; DD[j] = d; sl = FL[j]; FL[j] = cl; f = GG[j]; GG[j] = g; } } DD[0] = CC[0]; RR[N] = 0; /* Reverse phase: */ t = -q; /* Compute R(M/2,k) & S(M/2,k) for all k */ for (j = N-1; j >= 0; j--) { RR[j] = QQ[j] = t = t-r; SS[j] = t-q; BL[j] = MatchDis + 1; } t = -te; for (i = M-1; i >= midi; i--) { s = RR[N]; f = (i == M-1) ? de : s; RR[N] = c = t = t-r; sl = MatchDis + 1; e = t-q; va = v[(ai = A[i+1])]; wa = w[ai]; for (j = N-1; j >= 0; j--) { if ((c = c - qr) > (e = e - r)) e = c; if ((c = RR[j] - qr) > (d = SS[j] - r)) d = c; g = va[(bj = B[j+1])]; c = s+g; g += f; if ( ai != bj ) cl = sl + 1; else { if ( sl <= MatchDis ) g += wa[bj] + w[A[i+1+sl]][B[j+1+sl]]; if ( g < c ) g = c; cl = 1; } if (c < d) c = d; if (c < e) c = e; if (c < g) c = g; s = RR[j]; RR[j] = c; SS[j] = d; sl = BL[j]; BL[j] = cl; f = QQ[j]; QQ[j] = g; } } SS[N] = RR[N]; midc = CC[0]+RR[0]; /* Find optimal midpoint */ midj = 0; type = 1; for (j = 0; j <= N; j++) if ((c = CC[j] + RR[j]) >= midc) if (c > midc || CC[j] != DD[j] && RR[j] == SS[j]) { midc = c; midj = j; } for (j = N; j >= 0; j--) if ((c = DD[j] + SS[j] + q) > midc) { midc = c; midj = j; type = 2; } for (j = N-1; j > 0; j--) if ( FL[j] + BL[j] - 1 <= MatchDis ) { f = w[A[midi-FL[j]+1]][B[j-FL[j]+1]] + w[A[midi+BL[j]]][B[j+BL[j]]]; if ( ( c = GG[j] + QQ[j] + f ) > midc ) { midc = c; midj = j; type = 3; g = f; cl = FL[j] - 1; sl = BL[j] - 1; } } /* Conquer: recursively around midpoint */ if (type == 1) { diff(A,B,midi,midj,tb,q,db,0); diff(A+midi,B+midj,M-midi,N-midj,q,te,0,de); } else if ( type == 2) { diff(A,B,midi-1,midj,tb,0,db,0); DEL(2); diff(A+midi+1,B+midj,M-midi-1,N-midj,0,te,0,de); } else { diff(A,B,midi-cl,midj-cl,tb,q,db,g); for ( i = 1; i <= cl + sl; i++ ) { REP if ( A[midi-cl+i] == B[midj-cl+i] ) no_mat += 1; else no_mis += 1; } diff(A+midi+sl,B+midj+sl,M-midi-sl,N-midj-sl,q,te,g,de); } return midc; } /* Alignment display routine */ static char ALINE[51], BLINE[51], CLINE[51]; int display(A,B,M,N,S,AP,BP) char A[], B[]; int M, N; int S[], AP, BP; { register char *a, *b, *c; register int i, j, op; int lines, ap, bp; i = j = op = lines = 0; ap = AP; bp = BP; a = ALINE; b = BLINE; c = CLINE; while (i < M || j < N) { if (op == 0 && *S == 0) { op = *S++; *a = A[++i]; *b = B[++j]; *c++ = (*a++ == *b++) ? '|' : ' '; } else { if (op == 0) op = *S++; if (op > 0) { *a++ = ' '; *b++ = B[++j]; op--; } else { *a++ = A[++i]; *b++ = ' '; op++; } *c++ = '-'; } if (a >= ALINE+50 || i >= M && j >= N) { *a = *b = *c = '\0'; printf("\n%5d ",50*lines++); for (b = ALINE+10; b <= a; b += 10) printf(" . :"); if (b <= a+5) printf(" ."); printf("\n%5d %s\n %s\n%5d %s\n",ap,ALINE,CLINE,bp,BLINE); ap = AP + i; bp = BP + j; a = ALINE; b = BLINE; c = CLINE; } } } /* lib.c - library of C procedures. */ /* fatal - print message and die */ fatal(msg) char *msg; { fprintf(stderr, "%s\n", msg); exit(1); } /* fatalf - format message, print it, and die */ fatalf(msg, val) char *msg, *val; { fprintf(stderr, msg, val); putc('\n', stderr); exit(1); } /* ckopen - open file; check for success */ FILE *ckopen(name, mode) char *name, *mode; { FILE *fopen(), *fp; if ((fp = fopen(name, mode)) == NULL) fatalf("Cannot open %s.", name); return(fp); } /* ckalloc - allocate space; check for success */ char *ckalloc(amount) int amount; { char *malloc(), *p; if ((p = malloc( (unsigned) amount)) == NULL) fatal("Ran out of memory."); return(p); }