From: GOLD::GOLD::WINS%"mark%msw.usc.edu@oberon.usc.edu" 7-APR-1989 15:49 To: GILBERTD Subj: ralign Return-Path: <@jade.bacs.indiana.edu:mark%msw.usc.edu@oberon.usc.edu> Received: from jade.bacs.indiana.edu by gold.bacs.indiana.edu with SMTP ; Fri, 7 Apr 89 15:48:34 EST Received: from oberon.usc.edu by jade.bacs.indiana.edu with SMTP ; Fri, 7 Apr 89 15:45:51 EST Received: from msw.usc.edu by oberon.usc.edu (5.59/5.5) id AA25209; Fri, 7 Apr 89 13:42:09 PST Received: by msw.usc.edu (4.0/SMI-3.0DEV3) id AA16348; Fri, 7 Apr 89 13:40:23 PDT Date: Fri, 7 Apr 89 13:40:23 PDT From: mark%msw.usc.edu@oberon.usc.edu (Mark) Message-Id: <8904072040.AA16348@msw.usc.edu> To: gilbertd@gold.bacs.indiana.edu Subject: ralign Here are the ralign sources. #makefile .SUFFIXES: .o .c.o:;cc -c -O $*.c bitnet.o ralign.o rstuff.o readin.o:ralign.h ralign::ralign.o rstuff.o readin.o cc -o ralign ralign.o rstuff.o readin.o bitnet::ralign.o rstuff.o bitnet.o cc -o bitnet ralign.o rstuff.o bitnet.o bitnet.o ralign.o rstuff.o readin.o:ralign.h #ralign.c /************************************************************************************/ /* Consensus Alignment Algorithm */ /* code by Mark Eggert and Michael S. Waterman */ /* see discussion of "algorithm S" in MULTIPLE SEQUENCE ALIGNMENT BY CONSENSUS */ /* by Michael S. Waterman , Nucleic Acids Research, Vol. 14 #22, 1986 */ /* */ /* "ralign.c" of "ralign.h", "ralign.c", "rstuff.c" and "readin.c" 3/31/87 */ /*************************************************************************************/ /*************************************************************************************/ /* The program also needs "rstuff.c" and "readin.c". The program is invoked thus: */ /* ralign file window_size word_size max_mismatches max_insertions max_deletions */ /*************************************************************************************/ #include #define VAR #include "ralign.h" int mism=0, ins=0, del=0; /* cumulative score at i is R[i] */ int *R; /* initialized so indices run from -MAXWORDSIZE to MAXCOLS-MAXWORDSIZE-1 */ /* winning word at i is Rword[i] */ int *Rword; /* -MAXWORDSIZE to MAXCOLS-MAXWORDSIZE-1 */ /* cum score at i came from adding column score to cum score at M[i] */ int *M; /* -MAXWORDSIZE to MAXCOLS-MAXWORDSIZE-1 */ /* position of word found in column i, row j is backup[i][j] */ /* this defines the actual limits of scanning for column i+1 */ int (*backup)[MAXLINES]; /* [-MAXWORDSIZE to MAXCOLS-MAXWORDSIZE-1][0 to MAXLINES-1] */ /* characters in file */ char file_letters[MAXLINES][MAXCOLS]; /* number of rows in file, maximum number of columns */ int file_rows, file_columns; /* max deletions, insertions, mismatches */ int dellevel, inslevel, mismlevel; /* secret variables */ int maxcolumnscore, printmaxadd; /* character after char c is next_char[c] */ char next_char[4]={1,2,3,0}; /* hash table, last score added to hash table */ short hashtable[1<<(MAXWORDSIZE*2)], lastscore[1<<(MAXWORDSIZE*2)]; /* a global string */ char maxword[32]; /* storage for locations of found words */ int backup_tmp[MAXLINES]; /* data for output algorithm */ struct {short nthings; int things[MAXCOLS];} seqthings; int positions[MAXCOLS]; int segsize; score_file() { int i; /* index to score array */ int l; /* difference between W and scanning window size */ int aim, S, maxs; int newword; /* calculate cutoff */ aim=(double)(wordsize*file_rows)*cutoff_factor; /* start at wordsize-1 */ for (i=wordsize-1; iR[i]) { R[i]=maxs; /* if no pattern is chosen here */ if (S==0) { /* note lack of pattern */ Rword[i]= -1; /* copy former scanning limits */ backcopy(backup[i-W+l], backup[i]); } else { /* note pattern */ Rword[i]=newword; /* copy new scanning limits */ backcopy(backup_tmp, backup[i]); } /* note last position of cumulative score */ M[i]=i-W+l; } } } } int score(start, b, rw) int start; /* left of window */ int b; /* right of window */ int *rw; /* where to return consensus word */ { int a; /* left end of scan (depends on consensus words from earlier scans) */ int row; /* index to row in file */ int l; /* left end of word */ int r; /* right end of word */ int c; char *line; int hashaddress; char delline[32]; int bail; /* if window is too small, don't scan */ if (b-start+1=4) { bail=1; break; } if (bail) continue; /* clear table of last scores */ clear_lastscore(); /* left side of word to scan */ l=a; /* right side of word */ r=l+wordsize-1; /* move along window */ for (; r<=b; ++l, ++r) { /* start exact matches */ /* convert chars to int */ hashaddress=0; for (c=l; c<=r; ++c) { hashaddress<<=2; hashaddress|=3&line[c]; } /* count exact match of "hashaddress" */ if (wordsize>lastscore[hashaddress]) { /* new score */ hashtable[hashaddress]-=lastscore[hashaddress]; hashtable[hashaddress]+=wordsize; /* remember what this score was */ lastscore[hashaddress]=wordsize; /* is this a max? */ if (hashtable[hashaddress]>maxcolumnscore) { maxcolumnscore=hashtable[hashaddress]; printmaxadd=hashaddress; } else if (hashtable[hashaddress]==maxcolumnscore) { if (hashaddress>printmaxadd) { printmaxadd=hashaddress; } } } /* end exact matches */ /* start mismatches */ for (mismlevel=1; mismlevel<=mism; ++mismlevel) mismatches(line, l, r, mismlevel); /* end mismatches */ } /* start insertions */ if (ins>0) /* check insertions from least to greatest */ for (inslevel=1; inslevel<=ins; ++inslevel) { /* set left side */ l=a; /* set right side */ r=l+wordsize-1+inslevel; /* while right side is inside window */ for (; r<=b; ++l, ++r) insertions(line, l+1, r-1, inslevel); } /* end insertions */ /* begin deletions */ if (del>0) { /* check deletions from least to greatest */ for (dellevel=1; dellevel<=del; ++dellevel) { /* set left side */ l=a; /* set right side */ r=l+wordsize-1-dellevel; /* while right side is inside window */ for (; r<=b; ++l, ++r) { /* copy word to expand */ segcpy(delline, &line[l], wordsize-dellevel); /* operate on COPY */ deletions(delline, 0, r-l, dellevel); } } } /* end deletions */ } *rw=printmaxadd; make_word(printmaxadd); load_backup_tmp(start, b, printmaxadd); return(maxcolumnscore); } mismatches(line, l, r, m) register char *line; /* pointer to characters to check */ int l; /* position of left char */ int r; /* position of right char */ int m; /* depth of recursion from bottom */ { register int hashaddress; /* index in hash table */ register int i, c; int e, j; /* various indices */ /* if we're at the bottom of recursion */ if (m==1) { /* scan line from left to right */ for (i=l; i<=r; ++i) { /* for 3 different letters... */ for (j=0; j<3; ++j) { /* replace a char */ line[i]=next_char[line[i]]; /* convert word to index */ /* clear hash address word */ hashaddress=0; /* e is index of last char in word */ e=l+wordsize-1; /* shift chars into word */ for (c=l; c<=e; ++c) { /* note first shift is no-op */ hashaddress<<=2; hashaddress|=3&line[c]; } /* now hashaddress has word index */ /* if the last score for this word was lower than the current score or the word hasn't yet been found in this line... */ if (wordsize-mismlevel>lastscore[hashaddress]) { /* undo last score */ hashtable[hashaddress]-=lastscore[hashaddress]; /* add new score */ hashtable[hashaddress]+=wordsize-mismlevel; /* set last score */ lastscore[hashaddress]=wordsize-mismlevel; /* if this is max so far... */ if (hashtable[hashaddress]>maxcolumnscore) { /* set new max */ maxcolumnscore=hashtable[hashaddress]; /* store index globally */ printmaxadd=hashaddress; } else if (hashtable[hashaddress]==maxcolumnscore) { if (hashaddress>printmaxadd) { /* store index globally */ printmaxadd=hashaddress; } } } } /* restore original character */ line[i]=next_char[line[i]]; } } else { /* create one change at i in substring from l+m-1 to r... */ for (i=l+m-1; i<=r; ++i) { for (j=0; j<3; ++j) { line[i]=next_char[line[i]]; /* rest of changes to substring from l to i-1 */ mismatches(line, l, i-1, m-1); } /* restore char at i */ line[i]=next_char[line[i]]; } } } /* clear array of last scores */ clear_lastscore() { register long *lp=(long *)lastscore; register int i=(1<<(wordsize*2))/2; for (;--i>=0;) *lp++=0; } /* turn hash index into chars */ make_word(w) int w; { int i, j=0; for (i=wordsize; --i>=0; ) maxword[j++]=(w>>(i*2))&3; } /* location and score for output */ int showl, showr, showscore; /* find locations of winning consensus word to establish margins */ load_backup_tmp(start, b, wordette) int start, b, wordette; { int row; int l, a; register int r, c; register char *line; register int match, q; char delline[32]; int bail; printmaxadd=wordette; for (row=0; row=4) { bail=1; break; } if (bail) continue; l=a; r=l+wordsize-1; for (; r<=b; ++l, ++r) { /* start exact matches */ match=0; q=0; for (c=l; c<=r; ++c) { if (line[c]==maxword[q++]) ++match; } if (match>=wordsize-mism&&match>showscore) { showl=l; showr=r; showscore=match; } } for (inslevel=1; inslevel<=ins; ++inslevel) if (a+wordsize-1+inslevel<=b) { r=0; for (l=a; r0) { showl=l; showr=l+wordsize-dellevel-1; showscore=wordsize-dellevel; break; } } } if (showscore>0) backup_tmp[row]=showr+1; } } int minshowl, maxshowl; int reveal(start, b) int start, b; { int row; int a, l; register int r, c; register char *line; register int match, q; char delline[32]; int maxdell, maxdelr=0; int newsegsize; int bail; segsize=0; minshowl=b+W; maxshowl=0; for (row=0; row=4) { bail=1; break; } } if (bail) continue; l=a; r=l+wordsize-1; for (; r<=b; ++l, ++r) { /* start exact matches */ match=0; q=0; /* find positions of mismatch words */ for (c=l; c<=r; ++c) { if (line[c]==maxword[q++]) ++match; } if (match>=wordsize-mism&&match>showscore) { showl=l; if (maxshowll) minshowl=l; showr=r; showscore=match; } } /* find insertion words */ for (inslevel=1; inslevel<=ins; ++inslevel) if (a+wordsize-1+inslevel<=b) { r=0; for (l=a; r0) { showl=l; if (showl-dellevel>maxshowl) maxshowl=showl; if (minshowl>l) minshowl=l; showr=l+wordsize-dellevel-1; showscore=wordsize-dellevel; if (dellevel-maxdell+1>maxdelr) maxdelr=dellevel-maxdell+1; break; } } } /* actually mark line for output */ if (showscore>0) { if (showlsegsize) segsize=newsegsize; /* each marked word is associated with a column */ } else /* mark place where no word found */ line[a]|=0x20; } return(maxshowl-minshowl); } /* move along sequence, generating words in neighborhood if we create a word matching the winner, we mark the source word for display */ reveal_insertions(line, l, r, m) char *line; /* left position of group to scan, right position */ int l, r; /* recursion depth */ int m; { int hashaddress; int c, e, i; /* if end of recursion */ if (m==1) { /* scan from left to right */ for (i=l; i<=r; ++i) { /* mark a char in line for deletion */ line[i]|=(char)0x80; hashaddress=0; /* last char position to convert - 1 */ e=l+wordsize-2+inslevel; for (c=l-1; cshowscore) { /* mark position for output */ showl=l-1; if (minshowl>showl) minshowl=showl; showr=e; showscore=wordsize-inslevel; } } line[i]&= ~0x80; } } else { /* mark one char for deletion, recurse on subword to deleted char's left */ for (i=l+m-1; i<=r; ++i) { if (i-1>=l) { line[i]|=0x80; reveal_insertions(line, l, i-1, m-1); line[i]&= ~0x80; } } } } int reveal_deletions(line, l, r, m) char *line; int l, r, m; { int i, c, d, p; char delline[32]; register int hashaddress; int mr; if (m==1) { /* choose a position for insertion (i) */ for (i=l+1; i<=r; ++i) { for (d=0; d<4;) { p=0; /* copy line, adding new char at i */ for (c=0; cshowscore) { /* return "win" */ return(m); } } b3:++d; } } } else { /* choose insertion position */ for (i=l+1; i<=r; ++i) { for (d=0; d<4;) { p=0; /* copy, then insert */ for (c=0; c0) return(mr); b1:++d; } } } return(0); } insertions(line, l, r, m) char *line; int l, r, m; { register int hashaddress; register int c, e, i; register short *lastscorep, *hashp; if (m==1) { lastscorep=lastscore; hashp=hashtable; /* scan from left to right... */ for (i=l; i<=r; ++i) { /* mark one char for deletion */ line[i]|=(char)0x80; hashaddress=0; /* calculate last position to check */ e=l+wordsize-2+inslevel; /* scan and make address, omitting marked chars */ for (c=l-1; clastscorep[hashaddress]) { hashp[hashaddress]-=lastscorep[hashaddress]; hashp[hashaddress]+=wordsize-inslevel; lastscorep[hashaddress]=wordsize-inslevel; if (hashp[hashaddress]>maxcolumnscore) { maxcolumnscore=hashp[hashaddress]; printmaxadd=hashaddress; } else if (hashp[hashaddress]==maxcolumnscore) { if (hashaddress>printmaxadd) { printmaxadd=hashaddress; } } } /* restore line */ line[i]&= ~0x80; } } else { /* mark char for deletion */ for (i=l+m-1; i<=r; ++i) { if (i-1>=l) { line[i]|=0x80; /* recurse on subsequence left of deletion */ insertions(line, l, i-1, m-1); line[i]&= ~0x80; } } } } deletions(line, l, r, m) char *line; int l, r, m; { int i, c, d, p; char delline[32]; register int hashaddress; register short *lastscorep, *hashp; if (m==1) { lastscorep=lastscore; hashp=hashtable; /* insertion position is i */ for (i=l+1; i<=r; ++i) { /* for all new chars */ for (d=0; d<4;) { p=0; /* copy word with added char */ for (c=0; clastscorep[hashaddress]) { hashp[hashaddress]-=lastscorep[hashaddress]; hashp[hashaddress]+=wordsize-dellevel; lastscorep[hashaddress]=wordsize-dellevel; if (hashp[hashaddress]>maxcolumnscore) { maxcolumnscore=hashp[hashaddress]; printmaxadd=hashaddress; } else if (hashp[hashaddress]==maxcolumnscore) { if (hashaddress>printmaxadd) { printmaxadd=hashaddress; } } } b2:++d; } } } else { /* copy and insert */ for (i=l+1; i<=r; ++i) { for (d=0; d<4;) { p=0; for (c=0; c=0) *d++= *s++; } /* clear hashtable */ clear_hashtable() { register short *hp=hashtable; register int i=1<<(wordsize*2); for (;--i>=0;) *hp++=0; } /* list scores for chosen columns */ dump_scores() { int i; for (i=file_columns-1; i>=wordsize-1;) { if (Rword[i]!=-1&&R[i]>0) { /* print score */ printf("%d %4.2f ", i, (float)R[i]/(float)wordsize); /* print word at i */ dump_word(Rword[i]); /* decode word to maxword */ make_word(Rword[i]); /* note word */ printmaxadd=Rword[i]; /* search for patterns, */ /* assign space needed for output */ positions[i]=1+reveal(M[i]+1, i); seqthings.things[seqthings.nthings++]=i; putchar('\n'); } /* skip to next linked position */ i=M[i]; } } /* output word from hash address */ dump_word(w) register int w; { register int i; for (i=wordsize; --i>=0; ) putchar(basetosmall[(w>>(i*2))&3]); } /* mark chars for output */ int segmark(d, n) register char *d; register int n; { while (--n>=0) *d++|=0x40; *--d|=0x80; for (n=1; d[n]<4; ++n); if (n>W-wordsize) return(W-wordsize); return(n-1); } /* produce output */ align() { int i, row; char c; /* physical position on page */ int r=0, j=0; int thingi; /* index to things */ int startmark=0; /* scan length, total displacement */ int displace; displace=0; r=0; for (i=0; i=j) break; displace+=positions[seqthings.things[thingi]]-1; } */ for (;r extern int file_rows, file_columns; extern char file_letters[MAXLINES][MAXCOLS]; int readin(filename) char *filename; { FILE *input; int rows, columns; lettertobase['A']=0; lettertobase['a']=0; lettertobase['C']=1; lettertobase['c']=1; lettertobase['G']=2; lettertobase['g']=2; lettertobase['U']=3; lettertobase['u']=3; lettertobase['T']=3; lettertobase['t']=3; lettertobase['*']=4; basetoletter[0]='A'; basetoletter[1]='C'; basetoletter[2]='G'; basetoletter[3]='T'; basetosmall[0]='a'; basetosmall[1]='c'; basetosmall[2]='g'; basetosmall[3]='t'; for (rows=0; rowsMAXCOLS) printf("line too long (greater than %d)\n", MAXCOLS); file_rows=rows; if (rows>MAXLINES) printf("too many lines (greater than %d)\n", MAXLINES); file_columns=columns; getinput(input); #ifdef DEBUG writeout(); #endif return(0); } #define DONE 0 #define TITLE 1 #define DATA 2 #define FIND 3 scaninput(input, maxrows, maxcolumns) FILE *input; int *maxrows, *maxcolumns; { register int c, state, columns=0; *maxrows=0; *maxcolumns=0; for (state=TITLE; state!=DONE;) { c=getc(input); switch (state) { case TITLE: switch (c) { case '\n': state=DATA; break; case EOF: state=DONE; } break; case FIND: switch (c) { case '\n': ++*maxrows; if (columns>*maxcolumns) *maxcolumns=columns; columns=0; state=TITLE; continue; break; case EOF: ++*maxrows; if (columns>*maxcolumns) *maxcolumns=columns; state=DONE; return; break; default: state=DATA; } case DATA: switch(c) { case '\n': state=FIND; break; case EOF: break; case 'A': case 'a': case 'C': case 'c': case 'G': case 'g': case 'T': case 't': case 'U': case 'u': case '*': ++columns; } } } } getinput(input) FILE *input; { register int c, state; register char *rp=file_letters[0], *cp=rp; for (state=TITLE; state!=DONE;) { c=getc(input); switch (state) { case TITLE: switch (c) { case '\n': state=DATA; break; } break; case FIND: switch (c) { case '\n': *cp=20; rp+=MAXCOLS; cp=rp; state=TITLE; continue; break; case EOF: *cp=20; state=DONE; return; break; default: state=DATA; } case DATA: switch(c) { case '\n': state=FIND; break; case 'A': case 'a': case 'C': case 'c': case 'G': case 'g': case 'T': case 't': case 'U': case 'u': case '*': *cp++=lettertobase[c]; } } } } writeout() { register int row; register char *rp=file_letters[0], *cp; for (row=0; row extern int mism, ins, del; extern int file_rows, file_columns; /* allocation space */ int Ralloc[MAXCOLS]; int Rwordalloc[MAXCOLS]; int Malloc[MAXCOLS]; int backupalloc[MAXCOLS][MAXLINES]; set_arrays() { extern int *R; extern int *Rword; extern int *M; extern int (*backup)[MAXLINES]; /* connect pointers to allocation space */ R= &Ralloc[MAXWORDSIZE]; Rword= &Rwordalloc[MAXWORDSIZE]; M= &Malloc[MAXWORDSIZE]; backup=(int (*)[MAXLINES])backupalloc[MAXWORDSIZE]; } main(ac, av) int ac; char **av; { if (ac<4) { printf("ralign file window wordsize mismatches insertions deletions\n"); exit(-1); } /* for SUN systems */ vadvise(VA_ANOM); readin(av[1]); W=atoi(av[2]); wordsize=atoi(av[3]); if (ac>4) mism=atoi(av[4]); if (ac>5) ins=atoi(av[5]); if (ac>6) del=atoi(av[6]); set_arrays(); score_file(); dump_scores(); align(); exit(0); } backcopy(s, d) register int *s, *d; { register int i; for (i=0; i