Hi bionetters. I got so many requests for the signal sequence program I mentioned that I will repost it. Feel free to use, distribute or archive. Just send me email if you do and cite the program as, "Colgrove, RC, PhD Dissertation, UCSF dept of Biochemistry, 1990." This program implements the Von Heijne algorithm to find potential signal sequences (reference is at the end of the code). It does a very good job at finding real signals, even internal and atypical signals. I have used it to: -provide evidence that some newly cloned gene encode a secreted protein -identify weak of unusual internal signal sequences -identify mutations that would likely affect secretion -examine the statistical distribution of signal sequences in real protein sequences and in random sequences. The program with called with no arguments prints the matrices used and the Von Heijne reference. With one argument, it reads that argument as a file name containing a one-letter amino acid code and prints to the standard output one line for each residue showing the residue number, the one-letter code, and a score for how likely a peptidase site that position would be. At the last line, it prints the position of the best site and its score and tells whether it is a likely signal. To plot a graph of the scores, calling the program with an extra dummy argument (eg signal filename dummyarg) produces output that can be piped to UNIX plot and graph. For example the script: signal $1 s > infile graph -l "signal sequence profile for "$1 < infile | plot -T4014 > outfile cat outfile rm infile outfile makes a nice graph of the signal sequence profile ofthe filename given in $1. I have also included the code for a variant of the program, signal.filtered, which prints a zero score for all poor signal positions. This produces a much prettier graph where only the signals show up as clean spikes. I have only run these programs on UNIX machine but the C code should be portable to anything. Good luck and by all means let me know if you find these useful. -robin RC Colgrove colgrove@xtal10.harvard.edu :::::::::::::::::::::::::::::::::::::::::::::::: signal.c ::::::::::::::::::::::::::::::::::::::::::::::: #include #include #define aminotype "ACDEFGHIKLMNPQRSTVWY" #define SAMPLE 450 #define seqfile argv[1] float expect[] = {14.5,4.5,8.9,10.0,5.6,12.1,3.4,7.4,11.3,12.1, 2.7,7.1,7.4,6.3,7.6,11.4,9.7,11.1,1.8,5.6}; float freq[20][15] = { 16,13,14,15,20,18,18,17,25,15,47,6,80,18,6, 3,6,9,7,9,14,6,8,5,6,19,3,9,8,3, 0,0,0,0,0,0,0,0,5,3,0,5,0,10,11, 0,0,0,1,0,0,0,0,3,7,0,7,0,13,14, 13,9,11,11,6,7,18,13,4,5,0,13,0,6,4, 4,4,3,6,3,13,3,2,19,34,5,7,39,10,7, 0,0,0,0,0,1,1,0,5,0,0,6,0,4,2, 15,15,8,6,11,5,4,8,5,1,10,5,0,8,7, 0,0,0,1,0,0,1,0,0,4,0,2,0,11,9, 71,68,72,79,78,45,64,49,10,23,8,20,1,8,4, 0,3,7,4,1,6,2,2,0,0,0,1,0,1,2, 0,1,0,1,1,0,0,0,3,3,0,10,0,4,7, 2,0,2,0,0,4,1,8,20,14,0,1,3,0,22, 0,0,0,1,0,6,1,0,10,8,0,18,3,19,10, 2,0,0,0,0,1,0,0,7,4,0,15,0,12,9, 9,3,8,6,13,10,15,16,26,11,23,17,20,15,10, 2,10,5,4,5,13,7,7,12,6,17,8,6,3,10, 20,25,15,18,13,15,11,27,0,12,32,3,0,8,17, 4,3,3,1,1,2,5,3,1,3,0,9,0,2,0, 0,1,4,0,0,1,3,1,1,2,0,5,0,1,7 }; main(argc,argv) int argc; char *argv[]; { int i,j,k,shift; float weight[20][15],score[1000],max; FILE *fp, *fopen(); char c, seqbuff[15]; if(argc==1) { printf("\n\t\tSIGNAL SEQUENCE FINDER\n"); printf("\n\nFREQUENCY MATRIX\n\n"); printf("\t"); for(i=0;i<16;i++) { if(i>3) printf(" "); if(i>12) printf(" "); if(i==13) i++; printf("%d ",i-13); } printf("\tEXPECT\n\n"); for(j=0;j<20;j++) { printf("%c\t",aminotype[j]); for(i=0;i<15;i++) printf(" %2.0f ",freq[j][i]); printf("\t%4.1f\n",expect[j]); } } for(j=0;j<20;j++) { for(i=0;i<15;i++) { if(freq[j][i]==0) { freq[j][i]=1; if(i==10||i==12) freq[j][i]=expect[j]/SAMPLE; } weight[j][i]=log(freq[j][i]/expect[j]); } } if(argc==1) { printf("\n\nWEIGHT MATRIX (x 100)\n\n"); printf(" "); for(i=0;i<16;i++) { if(i>3) printf(" "); if(i>12) printf(" "); if(i==13) i++; printf(" %d",i-13); } printf("\n\n"); for(j=0;j<20;j++) { printf("%c ",aminotype[j]); for(i=0;i<15;i++) printf("%4.0f ",100*(weight[j][i])); printf("\n"); } printf("\nmethod from: Gunnar von Heijne, NAR 14, p.4683, 1986\n"); } if(argc!=1) { if((fp=fopen(seqfile,"r"))==NULL) printf("no such file\n"); if(argc==2) printf("\n\nSCORE PROFILE of %s\n\ncleave\tscore\tresidue\n\n",seqfile); while((c=getc(fp))!=EOF&&c!=NULL) { if(c==' '||c=='\t'||c=='\n') continue; seqbuff[14]=c; for(i=0;i<15;i++) for(j=0;j<20;j++) if(seqbuff[i]==aminotype[j]) score[k] += weight[j][i]; if(score[k]>max) { max=score[k]; shift=k; } printf("%d\t%4.2f",k-1,score[k]); if(argc==2) printf("\t%c",seqbuff[12]); printf("\n"); for(i=0;i<14;i++) seqbuff[i]=seqbuff[i+1]; k++; } if(argc==2) { printf("\nmax score is %4.2f, cleaved after res %d\n\n",max,shift-1); if(max<0.0) printf("improbable signal sequence\n"); if(max>=0.0&&max<5.0) printf("possible but ambiguous signal\n"); if(max>=5.0&&max<10.0) printf("probable signal sequence\n"); if(max>=10.0) printf("highly probable signal sequence\n"); } } } ::::::::::::::::::::::::::::::::::::::::::::::::::::::;;; signal.filtered.c :::::::::::::::::::::::::::::::::::::::::::::::::::::::: #include #include #define aminotype "ACDEFGHIKLMNPQRSTVWY" #define SAMPLE 450 #define seqfile argv[1] float expect[] = {14.5,4.5,8.9,10.0,5.6,12.1,3.4,7.4,11.3,12.1, 2.7,7.1,7.4,6.3,7.6,11.4,9.7,11.1,1.8,5.6}; float freq[20][15] = { 16,13,14,15,20,18,18,17,25,15,47,6,80,18,6, 3,6,9,7,9,14,6,8,5,6,19,3,9,8,3, 0,0,0,0,0,0,0,0,5,3,0,5,0,10,11, 0,0,0,1,0,0,0,0,3,7,0,7,0,13,14, 13,9,11,11,6,7,18,13,4,5,0,13,0,6,4, 4,4,3,6,3,13,3,2,19,34,5,7,39,10,7, 0,0,0,0,0,1,1,0,5,0,0,6,0,4,2, 15,15,8,6,11,5,4,8,5,1,10,5,0,8,7, 0,0,0,1,0,0,1,0,0,4,0,2,0,11,9, 71,68,72,79,78,45,64,49,10,23,8,20,1,8,4, 0,3,7,4,1,6,2,2,0,0,0,1,0,1,2, 0,1,0,1,1,0,0,0,3,3,0,10,0,4,7, 2,0,2,0,0,4,1,8,20,14,0,1,3,0,22, 0,0,0,1,0,6,1,0,10,8,0,18,3,19,10, 2,0,0,0,0,1,0,0,7,4,0,15,0,12,9, 9,3,8,6,13,10,15,16,26,11,23,17,20,15,10, 2,10,5,4,5,13,7,7,12,6,17,8,6,3,10, 20,25,15,18,13,15,11,27,0,12,32,3,0,8,17, 4,3,3,1,1,2,5,3,1,3,0,9,0,2,0, 0,1,4,0,0,1,3,1,1,2,0,5,0,1,7 }; main(argc,argv) int argc; char *argv[]; { int i,j,k,shift; float weight[20][15],score[1000],max; FILE *fp, *fopen(); char c, seqbuff[15]; if(argc==1) { printf("\n\t\tSIGNAL SEQUENCE FINDER\n"); printf("\n\nFREQUENCY MATRIX\n\n"); printf("\t"); for(i=0;i<16;i++) { if(i>3) printf(" "); if(i>12) printf(" "); if(i==13) i++; printf("%d ",i-13); } printf("\tEXPECT\n\n"); for(j=0;j<20;j++) { printf("%c\t",aminotype[j]); for(i=0;i<15;i++) printf(" %2.0f ",freq[j][i]); printf("\t%4.1f\n",expect[j]); } } for(j=0;j<20;j++) { for(i=0;i<15;i++) { if(freq[j][i]==0) { freq[j][i]=1; if(i==10||i==12) freq[j][i]=expect[j]/SAMPLE; } weight[j][i]=log(freq[j][i]/expect[j]); } } if(argc==1) { printf("\n\nWEIGHT MATRIX (x 100)\n\n"); printf(" "); for(i=0;i<16;i++) { if(i>3) printf(" "); if(i>12) printf(" "); if(i==13) i++; printf(" %d",i-13); } printf("\n\n"); for(j=0;j<20;j++) { printf("%c ",aminotype[j]); for(i=0;i<15;i++) printf("%4.0f ",100*(weight[j][i])); printf("\n"); } printf("\nmethod from: Gunnar von Heijne, NAR 14, p.4683, 1986\n"); } if(argc!=1) { if((fp=fopen(seqfile,"r"))==NULL) printf("no such file\n"); if(argc==2) printf("\n\nSCORE PROFILE of %s\n\ncleave\tscore\tresidue\n\n",seqfile); while((c=getc(fp))!=EOF&&c!=NULL) { if(c==' '||c=='\t'||c=='\n') continue; seqbuff[14]=c; for(i=0;i<15;i++) for(j=0;j<20;j++) if(seqbuff[i]==aminotype[j]) score[k] += weight[j][i]; if(score[k]>max) { max=score[k]; shift=k; } if(k>1) { if(argc>2&&score[k]<0) printf("%d\t0",k-1); else { if(argc>2) printf("%4.2f\t0\n",k-1.25); printf("%d\t%4.2f", k-1, score[k]); if(argc>2) printf("\n%4.2f\t0",k-0.75); } } if(argc==2) printf("\t%c",seqbuff[12]); printf("\n"); for(i=0;i<14;i++) seqbuff[i]=seqbuff[i+1]; k++; } if(argc==2) { printf("\nmax score is %4.2f, cleaved after res %d\n\n",max,shift-1); if(max<0.0) printf("improbable signal sequence\n"); if(max>=0.0&&max<5.0) printf("possible but ambiguous signal\n"); if(max>=5.0&&max<10.0) printf("probable signal sequence\n"); if(max>=10.0) printf("highly probable signal sequence\n"); } else { printf("%d\t0\t\" \"\n",k-1); printf("%d\t%4.2f\t\\n",shift-1,max,shift-1,max); printf("%d\t2.5\tpossible\n",k); printf("%d\t7.5\tprobable\n",k); printf("%d\t12.5\tdefinite\n",k); } } }