/* May, June 1987 - modified for rapid read of database June 2, 1987 - added TFASTA March 30, 1988 - combined ffgetaa, fgetgb; April 8, 1988 - added PIRLIB format for unix Feb 4, 1989 - added universal subroutines for libraries copyright (c) 1987,1988,1989,1992 William R. Pearson getnt.c associated subroutines for matching sequences */ /* 8-April-88 The compile time #define PIRLIB allows this routine to be used to read protein and DNA sequence libraries in the NBRF/PIR VAX/VMS library format. That is: >P1;LCBO This is a line of description GTYH ... the sequence starts on this line This may ease conversion from UWGCG format libraries. It has not been extensively tested. In addition, sequence libraries with a '>' in the 4th position are recognized as NBRF format libraries for consistency with UWGCG February 4, 1988 - this starts a major revision of the getaa routines. The goal is to be able to search the following format libraries: 0 - normal FASTA format 1 - full Genbank tape format 2 - NBRF/PIR CODATA format 3 - EMBL/Swiss-prot format 4 - Intelligenetics format 5 - NBRF/PIR VMS format 11 - NCBI setdb/blastp (1.3.2) AA see file altlib.h to confirm numbers This is done with a new global variable and a requirement for the FASTLIBS file. The FASTLIBS file will now indicate both the sequence type (protein = 0, DNA = 1) and the file format (the numbers shown above, although Intelligenetics may become an alternative to Pearson). This will be done by always using a function pointer for getlib and ranlib(), and setting up a bunch of different getlib() and ranlib() functions. Openlib() will be substantially simplified. */ /* Nov 12, 1987 - this version checks to see if the sequence is DNA or protein by asking whether > 85% is A, C, G, T May 5, 1988 - modify the DNA/PROTEIN checker by re-reading DNA sequences in order to check for 'U'. 
*/ /* NOTE(review): this copy of the file is damaged. The header names after the #include directives below are missing, and in several places the text between a less-than sign and the next greater-than sign has been lost, including the tail of getseq and the head of the routine that owns the found: label further down this line (that routine copies the line it found into title and returns the length of title). Restore the file from a pristine source before compiling. */ #include #include #include #include "uascii.gbl" #ifdef VMS #define PIRLIB #endif #define XTERNAL #include "upam.gbl" #undef XTERNAL #ifndef TRUE #define TRUE 1 #define FALSE 0 #endif /* MAXLINE is the size of the shared library line buffer lline declared further down this line */ #define MAXLINE 512 #define MAXR 15 int lascii[] = {ES, 0, 1, 7, 2, 5, 9,13, 3, 8, 6,12, 10,11,14,15}; #define LAMASK 15 /* getseq(filen,seq,maxs,dnaseq): read the sequence file filen. Returns 0, after a message on stderr, when filen cannot be opened. The file is read with fgets into a 512-byte buffer; lines whose first character is > or ; are not passed to the per-character loop, and when PIRLIB is defined a line with > in column 1 and ; or > in column 4 also swallows the line that follows it (the NBRF/PIR description line described in the file header). When the test ending in 0.85 succeeds, *dnaseq is set to 1, sascii is switched to nascii, the file is rewound and it is read a second time. NOTE(review): the loops that fill seq, the start of the 0.85 test and the end of this function are missing from this copy, so the normal return value cannot be determined here. Presumably seq receives at most maxs encoded residues - confirm against a pristine source. */ getseq(filen,seq,maxs,dnaseq) char *filen, *seq; int maxs, *dnaseq; { FILE *fptr; char line[512]; int i, j, n; int ic; if ((fptr=fopen(filen,"r"))==NULL) { fprintf(stderr," could not open %s\n",filen); return 0; } n=0; while(fgets(line,512,fptr)!=NULL) { #ifdef PIRLIB /* PIR-style header line: also consume the line that follows it */ if (line[0]=='>'&& (line[3]==';'||line[3]=='>')) fgets(line,512,fptr); else #endif if (line[0]!='>'&& line[0]!=';') { /* NOTE(review): text lost here - the loop condition, the loop body and the start of the 0.85 test are incomplete */ for (i=0; (n 0.85) { *dnaseq = 1; /* convert from protein to DNA sequence */ sascii = nascii; fseek(fptr,0l,0); n=0; while(fgets(line,512,fptr)!=NULL) { #ifdef PIRLIB if (line[0]=='>'&& (line[3]==';'||line[3]=='>')) fgets(line,512,fptr); else #endif if (line[0]!='>'&& line[0]!=';') { /* NOTE(review): text lost here - getseq ends and the title routine begins somewhere inside the gap */ for (i=0; (n'|| line[0]==';') goto found; } fclose(fptr); title[0]='\0'; return 0; found: #ifdef PIRLIB if (line[0]=='>'&&(line[3]==';'||line[3]=='>')) { if ((bp = strchr(line,'\n'))!=NULL) *bp='\0'; ll=strlen(line); line[ll++]=' '; line[ll]='\0'; fgets(&line[ll],512-ll,fptr); } #endif #ifdef MSDOS bp = strpbrk(line,"\n\r"); #else bp = strchr(line,'\n'); #endif if (bp!=NULL) *bp = 0; strncpy(title,line,len); title[len-1]='\0'; fclose(fptr); return strlen(title); } #ifndef VMS FILE *libf=NULL; #else int libf = -1; #endif #ifdef NOLIB int leof = 0; #endif long lpos; unsigned char lline[MAXLINE]; #ifndef NOLIB #include "altlib.h" extern int ldnaseq; int (*getlib)(); int (*ranlib)(); #define GETLIB agetlib #define RANLIB aranlib #else #define LASTLIB 10 #define GETLIB getlib #define RANLIB ranlib #endif /* the following is from fgetgb.c */ #include #ifndef O_RAW #ifdef O_BINARY #define O_RAW O_BINARY #else #define O_RAW 0 #endif /* O_BINARY */ #endif /* O_RAW */ int libfd= -1; #ifndef NOLIB extern int 
deftype; /* default library type */ extern int outtty; /* flag for no interaction */ #ifndef UNIX #define RBSTR "rb" /* read file in binary mode */ #else #define RBSTR "r" #endif #else int deftype=0; int outtty=1; #endif int libtype; /* current open library type */ int sfnum; /* superfamily number from types 0 and 5 */ /* a file name for openlib may now include a library type suffix */ /* openlib(lname,libenv): open the sequence library lname, prefixed by libenv when libenv is not empty (with a / separator when UNIX is defined). The name may be followed by a blank and an integer library type; a type below 0 or not below LASTLIB is reported on stderr and replaced by deftype, and deftype is also used when no type is given. Unless NOLIB is defined, getlib and ranlib are set from getliba[libtype] and ranliba[libtype]. Returns -9 if lname starts with #; 1 on success; 0 if the open fails and outtty is 0, or if an empty replacement name is entered; -1 on end of input at the prompt, after more than 10 failed retries, or if the first line of a text library cannot be read. On success with a text library (libtype not above LASTTXT, or always under NOLIB) lpos holds the offset of the first line and lline holds that line. NOTE(review): strncpy(libn,..,120) does not terminate libn when the source has 120 or more characters, and the length passed to strncat is the number of characters to append, not the size of libn, so long names can overrun libn - confirm and fix. NOTE(review): rline and ll are never used in this function. */ openlib(lname,libenv) char *lname, *libenv; { char rline[10],libn[120], *bp; long ftell(); int wcnt, ll, opnflg; if (lname[0]=='#') return -9; wcnt = 0; #ifndef NOLIB if (strlen(libenv)!=0) { strncpy(libn,libenv,120); #ifdef UNIX strncat(libn,"/",120); #endif strncat(libn,lname,120-strlen(libn)); } else strncpy(libn,lname,120); #else strncpy(libn,lname,120); #endif /* check for library type */ if ((bp=strchr(libn,' '))!=NULL) { *bp='\0'; sscanf(bp+1,"%d",&libtype); if (libtype<0 || libtype >= LASTLIB) { fprintf(stderr," invalid library type: %d (>%d)- resetting\n%s\n", libtype,LASTLIB,lname); libtype=deftype; } } else libtype=deftype; #ifndef NOLIB getlib=getliba[libtype]; ranlib=ranliba[libtype]; /* NOTE(review): the retry below jumps back to l1, which lies after the type suffix has been parsed, so a suffix typed at the prompt stays in libn and is not interpreted. opnflg is also left unset when libtype is above LASTTXT and neither optional branch is compiled in or matches - confirm. */ l1: if (libtype<=LASTTXT) opnflg=((libf=fopen(libn,"r"))!=NULL); #ifdef NCBIBL13 else if (libtype==NCBIBL13) opnflg=(ncbl_openlib(libn)!= -1); #endif #ifdef SUBSETLIB else if (libtype==SUBLIB) opnflg= subset_openlib(libn); #endif if (!opnflg) { #else l1: if ((libf=fopen(libn,"r"))==NULL) { #endif /* the open failed: when outtty is nonzero ask for another name on stderr and read it from stdin */ if (outtty) { fprintf(stderr," cannot open %s library\n",libn); fprintf(stderr," enter new file name or to quit "); fflush(stderr); if (fgets(libn,120,stdin)==NULL) return -1; if ((bp=strchr(libn,'\n'))!=0) *bp='\0'; if (strlen(libn)==0) return 0; if (++wcnt > 10) return -1; goto l1; } else return 0; } /* prime lline with the first line of the library and remember its offset in lpos */ #ifndef NOLIB if (libtype<=LASTTXT) { lpos = ftell(libf); if (fgets((char*)lline,MAXLINE,libf)==NULL) return -1; } #else /* NOLIB */ lpos = ftell(libf); if (fgets(lline,MAXLINE,libf)==NULL) return -1; leof = 0; #endif /* NOLIB */ return 1; } #ifdef SUBSETLIB FILEtype setlib(FILEtype newlib) { /* dgg 
-- this is just to allow use of the xranlib() functions from another code section */ /* setlib body: install newlib as the current library handle libf and hand back the previous one */ FILEtype oldlib= libf; libf= newlib; return oldlib; } #endif /* closelib(): close libf if it is not NULL and reset it to NULL; when SUBSETLIB is defined also call subset_closelib(). NOTE(review): libf is compared with NULL here, but under VMS it is declared as an int initialised to -1 - confirm. */ closelib() { if (libf!=NULL) { fclose(libf); libf = NULL; } #ifdef SUBSETLIB subset_closelib(); #endif } /* GETLIB (expands to agetlib, or to getlib when NOLIB is defined): read the next piece of a library entry whose header line starts with > or ;. seq receives the translated sequence and is terminated with EOSEQ. maxs is the size of seq; filling stops 9 bytes short of it (seqm). libstr receives up to 10 characters of the header line following its first character. libpos receives lpos, the file offset recorded for the header line. lcont must be 0 on entry to start a new entry; on return it is incremented if the buffer filled up (call again to continue the same entry) and reset to 0 otherwise. Returns 0 when no further header line can be read (or, under NOLIB, when leof is set); otherwise the number of bytes stored in seq, or 1 if none were stored. Characters are translated through sascii, or through nascii when TFASTA is defined. NOTE(review): the per-character translation loops and part of the main read loop are missing from this copy (text lost between seqp and the next greater-than sign). NOTE(review): at label new the fgets call writes at the start of lline, so it replaces the partial header just copied there instead of extending it - confirm intent. */ GETLIB(seq,maxs,libstr,libpos,lcont) unsigned char *seq; int maxs; char *libstr; long *libpos; int *lcont; { long ftell(); int ll; int ic; register unsigned char *cp, *seqp; register int *ap; unsigned char *seqm, *seqm1; char *linep, *bp; seqp = seq; seqm = &seq[maxs-9]; seqm1 = seqm-1; #ifndef TFASTA ap = sascii; #else ap = nascii; #endif if (*lcont==0) { #ifndef NOLIB /* skip forward to the next header line, remembering where it starts */ while (lline[0]!='>' && lline[0]!=';') { lpos = ftell(libf); if (fgets((char*)lline,MAXLINE,libf)==NULL) return 0; } #ifdef SUPERFAMNUM if ((bp=strchr(lline,SFCHAR))!=NULL) { *bp='\0'; sscanf(bp+1,"%d",&sfnum); } else sfnum=0; #else sfnum = 0; #endif strncpy(libstr,(char*)lline+1,20); libstr[10]='\0'; *libpos = lpos; #else /* NOLIB */ if (leof) return 0; *libpos = lpos; if (lline[0]=='>' || lline[0]==';') { strncpy(libstr,lline+1,20); libstr[10]='\0'; } else { libstr[0]='\0'; strncpy(seqp,lline,(int)(seqm-seqp)); for (cp=seqp; seqpNA) break; } if (*seqp==ES) goto done; } #endif } lline[0]='\0'; while (seqp') goto new; if (*seqp==';') { if (strchr((char*)seqp,'\n')==NULL) goto cont; continue; } for (cp=seqp; seqpNA) break; } if (*seqp==ES) goto done; lpos = ftell(libf); } goto done; new: strncpy((char*)lline,(char*)seqp,MAXLINE); lline[MAXLINE-1]='\0'; if (strchr((char*)seqp,'\n')==NULL) fgets((char*)lline,MAXLINE-strlen((char*)lline),libf); goto done; cont: fgets((char*)lline,MAXLINE,libf); seqm1 = seqp; done: if (seqp>=seqm1) { (*lcont)++; } else { #ifdef NOLIB leof = 1; #endif *lcont=0; } *seqp = EOSEQ; if ((int)(seqp-seq)==0) return 1; return (int)(seqp-seq); } RANLIB(str,cnt,seek) char *str; int cnt; long seek; { char *bp; int ll; fseek(libf, seek, 0); fgets((char*)lline,MAXLINE,libf); if (lline[0]=='>' || lline[0]==';') { 
/* RANLIB, continued from the previous line: the line read at offset seek is a header, so copy up to cnt characters following its first character into str and cut str at SFCHAR (when SUPERFAMNUM is defined) or at the newline; in the else branch str is set to the empty string. Under NOLIB the flag leof is cleared. */ strncpy(str,(char*)lline+1,cnt); str[cnt-1]='\0'; #ifdef SUPERFAMNUM if ((bp = strchr(str,SFCHAR))!=NULL) *bp='\0'; else if ((bp = strchr(str,'\n'))!=NULL) *bp='\0'; else str[cnt-1]='\0'; #else if ((bp = strchr(str,'\n'))!=NULL) *bp='\0'; else str[cnt-1]='\0'; #endif } else { str[0]='\0'; } #ifdef NOLIB leof=0; #endif } #ifndef NOLIB /* cpsave: position inside lline where translation stopped when a getlib routine returned with a full buffer; the continuation call resumes from it */ unsigned char *cpsave; /* lgetlib: getlib variant for entries that start with a LOCUS line and whose sequence follows an ORIGIN line (presumably the Genbank tape format listed in the file header - confirm against altlib.h). seq and maxs give the output buffer (filling stops 11 bytes short of maxs); libstr receives up to 10 characters starting at index 11 of the LOCUS line; *libpos receives the offset recorded for that line; *lcont is 0 to start a new entry and is incremented on return when the buffer filled up, else reset to 0. Returns 0 at end of file, otherwise the number of bytes stored in seq, or 1 if none were stored. NOTE(review): the translation loops are missing from this copy, and no goto to the label new survives in the remaining text. */ lgetlib(seq,maxs,libstr,libpos,lcont) unsigned char *seq; int maxs; char *libstr; long *libpos; int *lcont; { long ftell(); int i, n, ll; int ic; register unsigned char *cp, *seqp; register int *ap; unsigned char *seqm, *seqm1; char *linep; seqp = seq; seqm = &seq[maxs-11]; seqm1 = seqm-1; #ifndef TFASTA ap = sascii; #else ap = nascii; #endif if (*lcont==0) { while (lline[0]!='L' || lline[1]!='O' || strncmp((char*)lline,"LOCUS",5)) { /* find LOCUS */ lpos = ftell(libf); if (fgets((char*)lline,MAXLINE,libf)==NULL) return 0; } strncpy(libstr,(char*)&lline[11],20); libstr[10]='\0'; *libpos=lpos; while (lline[0]!='O' || lline[1]!='R' || strncmp((char*)lline,"ORIGIN",6)) { /* find ORIGIN */ if (fgets((char*)lline,MAXLINE,libf)==NULL) return 0; } } else { /* continuation call: resume inside the line saved in lline; NOTE(review): text lost in the loops below */ for (cp= cpsave; seqpNA) break; } } lline[0]='\0'; while (seqpNA) break; } } goto done; new: lpos = ftell(libf); fgets((char*)lline,sizeof(lline),libf); done: if (seqp>=seqm1) { cpsave = cp; (*lcont)++; } else *lcont=0; *seqp = EOSEQ; if ((int)(seqp-seq)==0) return 1; return (int)(seqp-seq); } /* lranlib(str,cnt,seek): build a description for the entry at file offset seek. Copies 10 characters starting at index 12 of the line at seek into str, then appends the text from index 11 of the following DEFINITION line, limited to cnt-1 characters overall and cut at the first newline. Finally seeks back and rereads the line at seek into lline. NOTE(review): lgetlib takes the name from index 11 but this routine uses index 12 - confirm which is right. NOTE(review): the search loop ignores the fgets result, so it never ends if no DEFINITION line follows; str[10] is also written without checking that cnt exceeds 10. */ lranlib(str,cnt,seek) char *str; int cnt; long seek; { char *bp; int ll; fseek(libf, seek, 0); fgets((char*)lline,MAXLINE,libf); strncpy(str,(char*)&lline[12],10); str[10]='\0'; fgets((char*)lline,sizeof(lline),libf); while (lline[0]!='D' || lline[1]!='E' || strncmp((char*)lline,"DEFINITION",10)) fgets((char*)lline,sizeof(lline),libf); strncpy(&str[10],(char*)&lline[11],cnt-10); str[cnt-1]='\0'; if ((bp = strchr(str,'\n'))!=NULL) *bp='\0'; fseek(libf,seek,0); fgets((char*)lline,MAXLINE,libf); } pgetlib(seq,maxs,libstr,libpos,lcont) unsigned char *seq; int maxs; char *libstr; 
/* pgetlib (its parameter declarations continue from the previous line): getlib variant for entries that start with an ENTRY line and whose sequence follows a SEQUENCE line plus one extra line (presumably the NBRF/PIR CODATA format listed in the file header - confirm against altlib.h). libstr receives 8 characters starting at index 16 of the ENTRY line; filling stops 11 bytes short of maxs. *libpos, *lcont and the return value behave as in the other getlib routines of this file. NOTE(review): the translation loops are missing from this copy. */ long *libpos; int *lcont; { long ftell(); int i, n, ll; int ic; register unsigned char *cp, *seqp; register int *ap; unsigned char *seqm, *seqm1; char *linep; seqp = seq; seqm = &seq[maxs-11]; seqm1 = seqm-1; #ifndef TFASTA ap = sascii; #else ap = nascii; #endif if (*lcont==0) { while (lline[0]!='E' || lline[1]!='N' || strncmp((char*)lline,"ENTRY",5)) { /* find ENTRY */ lpos = ftell(libf); if (fgets((char*)lline,MAXLINE,libf)==NULL) return 0; } strncpy(libstr,(char*)&lline[16],8); libstr[8]='\0'; *libpos = lpos; while (lline[0]!='S' || lline[2]!='Q' || strncmp((char*)lline,"SEQUENCE",8)) { /* find SEQUENCE */ if (fgets((char*)lline,MAXLINE,libf)==NULL) return 0; } fgets((char*)lline,sizeof(lline),libf); /* get the extra line */ } else { /* continuation call: resume inside the line saved in lline; NOTE(review): text lost in the loops below */ for (cp= cpsave; seqpNA) break; } if (*seqp==ES) goto done; } lline[0]='\0'; while (seqpNA) break; }; if (*seqp==ES) goto done; } goto done; new: lpos = ftell(libf); fgets((char*)lline,sizeof(lline),libf); done: if (seqp>=seqm1) { cpsave = cp; (*lcont)++; } else *lcont=0; *seqp = EOSEQ; if ((int)(seqp-seq)==0) return 1; return (int)(seqp-seq); } /* pranlib(str,cnt,seek): build a description for the entry at file offset seek. Copies 8 characters starting at index 16 of the line at seek into str, then appends the text from index 16 of the following TITLE line (no separator is inserted), limited to cnt-1 characters overall and cut at the first newline. Finally seeks back and rereads the line at seek into lline. NOTE(review): the search loop ignores the fgets result, so it never ends if no TITLE line follows; str[8] is written without checking cnt. */ pranlib(str,cnt,seek) char *str; int cnt; long seek; { char *bp; int ll; fseek(libf, seek, 0); fgets((char*)lline,MAXLINE,libf); strncpy(str,(char*)&lline[16],8); str[8]='\0'; fgets((char*)lline,sizeof(lline),libf); while (lline[0]!='T' || lline[1]!='I' || strncmp((char*)lline,"TITLE",5)) fgets((char*)lline,sizeof(lline),libf); strncpy(&str[8],(char*)&lline[16],cnt-9); str[cnt-1]='\0'; if ((bp = strchr(str,'\n'))!=NULL) *bp='\0'; fseek(libf,seek,0); fgets((char*)lline,MAXLINE,libf); } /* seqsiz: number parsed by egetlib from offset 14 of the SQ line; egetlib subtracts the bytes returned each time the buffer fills */ long seqsiz; /* egetlib: getlib variant for entries that start with an ID line and whose sequence follows an SQ line (presumably the EMBL/Swiss-prot format listed in the file header - confirm against altlib.h). libstr receives the first whitespace-delimited word found from index 5 of the ID line, padded or cut to exactly 10 characters; filling stops 11 bytes short of maxs. *libpos, *lcont and the return value behave as in the other getlib routines of this file. */ egetlib(seq,maxs,libstr,libpos,lcont) unsigned char *seq; int maxs; char *libstr; long *libpos; int *lcont; { long ftell(); int ll; int ic; register unsigned char *cp, *seqp; register int *ap; unsigned char *seqm, *seqm1; char *linep; char id[11]; /* Holds Identifier */ seqp = seq; seqm = &seq[maxs-11]; seqm1 = seqm-1; #ifndef TFASTA ap = sascii; #else ap = nascii; #endif if (*lcont==0) { while 
(lline[0]!='I' || lline[1]!='D') { /* find ID */ lpos = ftell(libf); if (fgets((char*)lline,MAXLINE,libf)==NULL) return 0; } /* NOTE(review): %s has no width limit, so a word longer than 10 characters overruns id[11] */ sscanf((char*)&lline[5],"%s",id); sprintf(libstr,"%-10.10s",id); *libpos = lpos; while (lline[0]!='S' || lline[1]!='Q') { /* find the SQ line */ if (fgets((char*)lline,MAXLINE,libf)==NULL) return 0; } sscanf((char*)&lline[14],"%ld",&seqsiz); } else { /* continuation call: resume inside the line saved in lline; NOTE(review): text lost in the loops below */ for (cp= cpsave; seqpNA) break; } if (*seqp==ES) goto done; } lline[0]='\0'; while (seqpNA) break; } if (*seqp==ES) goto done; } goto done; new: lpos = ftell(libf); fgets((char*)lline,sizeof(lline),libf); goto done; done: if (seqp>=seqm1) { cpsave = cp; (*lcont)++; seqsiz -= (long)(seqp-seq); } else *lcont=0; *seqp = EOSEQ; if ((int)(seqp-seq)==0) return 1; /* if (*lcont==0 && (long)(seqp-seq)!=seqsiz) printf("%s read %d of %d\n",libstr,(int)(seqp-seq),seqsiz); */ return (int)(seqp-seq); } /* eranlib(str,cnt,seek): build a description for the entry at file offset seek. Writes the identifier word from index 5 of the line at seek, padded or cut to 10 characters and followed by one blank, then appends the text from index 5 of the next line that starts with DE, limited to cnt-1 characters overall and cut at the first carriage return or newline. Finally seeks back and rereads the line at seek into lline. NOTE(review): sprintf writes 12 bytes into str regardless of cnt, the unbounded %s can overrun id[11], and the search loop ignores the fgets result so it never ends if no DE line follows. */ eranlib(str,cnt,seek) char *str; int cnt; long seek; { char *bp; char id[11]; /* Holds Identifier */ int ll; fseek(libf, seek, 0); fgets((char*)lline,MAXLINE,libf); sscanf((char*)&lline[5],"%s",id); sprintf(str,"%-10.10s ",id); fgets((char*)lline,sizeof(lline),libf); while (lline[0]!='D' || lline[1]!='E') fgets((char*)lline,sizeof(lline),libf); strncpy(&str[11],(char*)&lline[5],cnt-11); str[cnt-1]='\0'; if ((bp = strchr(str,'\r'))!=NULL) *bp='\0'; if ((bp = strchr(str,'\n'))!=NULL) *bp='\0'; fseek(libf,seek,0); fgets((char*)lline,MAXLINE,libf); } /* igetlib: getlib variant for entries introduced by one or more lines starting with ; followed by a name line (presumably the Intelligenetics format listed in the file header - confirm against altlib.h). *libpos receives the offset recorded for the ; line that starts the entry; libstr is taken from the first following line that does not start with ;, beginning at its second character. Filling stops 9 bytes short of maxs. *lcont and the return value behave as in the other getlib routines of this file. NOTE(review): iranlib copies the name line from its first character, this routine from its second - confirm which is right. The loop that skips ; lines ignores the fgets result and never ends if the file stops inside such a block. */ igetlib(seq,maxs,libstr,libpos,lcont) unsigned char *seq; int maxs; char *libstr; long *libpos; int *lcont; { long ftell(); int i, n, ll; int ic; register unsigned char *cp, *seqp; register int *ap; unsigned char *seqm, *seqm1; char *linep, *bp; seqp = seq; seqm = &seq[maxs-9]; seqm1 = seqm-1; #ifndef TFASTA ap = sascii; #else ap = nascii; #endif if (*lcont==0) { while (lline[0]!=';') { lpos = ftell(libf); if (fgets((char*)lline,MAXLINE,libf)==NULL) return 0; } *libpos = lpos; while (lline[0]==';') fgets((char*)lline,sizeof(lline),libf); strncpy(libstr,(char*)lline+1,10); 
/* igetlib, continued from the previous line: cap libstr at 9 characters and drop a trailing newline. NOTE(review): text is missing from the read loop that follows (lost between seqp and the next greater-than sign). */ libstr[9]='\0'; if((bp=strchr(libstr,'\n'))!=NULL) *bp='\0'; } /* Notes on the two routines that begin later on this line. iranlib(str,cnt,seek): rereads the line at offset seek; if it starts with > or ; its text after the first character is saved in tline, otherwise tline is emptied. It then skips lines starting with ;, cuts the next line at its first newline and at its first blank, and builds str as that word, one blank and tline, with str[cnt-1] forced to the terminator at the end. Finally it seeks back and rereads the line at seek into lline. NOTE(review): right after tline is filled, the terminator and the newline trim are applied to str instead of tline, so str is scanned before this routine has stored anything in it, while tline keeps its newline and may be left unterminated - looks unintended, confirm. NOTE(review): the lengths passed to strncat leave no room for the terminator, and the loop skipping ; lines ignores the fgets result. vgetlib: getlib variant whose header line starts with > or ; and whose name is copied from index 4 of that line (presumably the NBRF/PIR VMS format described in the file header, where the name follows a four-character prefix - confirm against altlib.h); filling stops 9 bytes short of maxs. */ lline[0]='\0'; while (seqp') goto new; if (*seqp==';') { if (strchr((char*)seqp,'\n')==NULL) goto cont; continue; } for (cp=seqp; seqpNA) break; } if (*seqp==ES) goto done; lpos = ftell(libf); } goto done; new: strncpy((char*)lline,(char*)seqp,MAXLINE); lline[MAXLINE-1]='\0'; if (strchr((char*)seqp,'\n')==NULL) fgets((char*)lline,MAXLINE-strlen((char*)lline),libf); goto done; cont: fgets((char*)lline,MAXLINE,libf); seqm1 = seqp; done: if (seqp>=seqm1) { (*lcont)++; } else { *lcont=0; } *seqp = EOSEQ; if ((int)(seqp-seq)==0) return 1; return (int)(seqp-seq); } iranlib(str,cnt,seek) char *str; int cnt; long seek; { char *bp; int ll; char tline[120]; fseek(libf, seek, 0); fgets((char*)lline,MAXLINE,libf); if (lline[0]=='>' || lline[0]==';') { strncpy(tline,(char*)lline+1,sizeof(tline)); str[cnt-1]='\0'; if ((bp = strchr(str,'\n'))!=NULL) *bp='\0'; else str[cnt-1]='\0'; } else { tline[0]='\0'; } while (lline[0]==';') fgets((char*)lline,sizeof(lline),libf); if ((bp=strchr((char*)lline,'\n'))!=NULL) *bp=0; if ((bp=strchr((char*)lline,' '))!=NULL) *bp=0; strncpy(str,(char*)lline,cnt); strncat(str," ",cnt-strlen(str)); strncat(str,tline,cnt-strlen(str)); str[cnt-1]='\0'; fseek(libf,seek,0); fgets((char*)lline,MAXLINE,libf); } vgetlib(seq,maxs,libstr,libpos,lcont) unsigned char *seq; int maxs; char *libstr; long *libpos; int *lcont; { long ftell(); int i, n, ll; int ic; register unsigned char *cp, *seqp; register int *ap; unsigned char *seqm, *seqm1; char *linep, *bp; seqp = seq; seqm = &seq[maxs-9]; seqm1 = seqm-1; #ifndef TFASTA ap = sascii; #else ap = nascii; #endif if (*lcont==0) { while (lline[0]!='>' && lline[0]!=';') { lpos = ftell(libf); if (fgets((char*)lline,MAXLINE,libf)==NULL) return 0; } #ifdef SUPERFAMNUM if ((bp=strchr((char*)lline,SFCHAR))!=NULL) { *bp='\0'; sscanf(bp+1,"%d",&sfnum); } else #endif sfnum=0; if ((bp=strchr((char*)lline,'\n'))!=NULL) *bp='\0'; strncpy(libstr,(char*)&lline[4],20); 
/* vgetlib, continued from the previous line: read the line that follows the header (lline[0] is cleared right after this block, so that line is not used further here), cap libstr at 10 characters and report the header offset through libpos. */ fgets((char*)lline,MAXLINE,libf); libstr[10]='\0'; *libpos = lpos; } /* Notes on the routines that begin later on this line. vranlib(str,cnt,seek): rereads the line at offset seek. If it has > in column 1 and ; or > in column 4, str receives the text from index 4 cut at the first colon and at the first carriage return or newline, followed by one blank and the next line stripped of carriage return and newline; otherwise str is set to the empty string. Finally it seeks back and rereads the line at seek into lline. NOTE(review): strncat(str, blank, cnt) passes the buffer size as the append count, the second strncat leaves no room for the terminator, and strncpy may leave str unterminated before strchr scans it - confirm and fix. scanseq: only its opening lines are present; the rest of the function lies beyond the end of this copy. NOTE(review): text is also missing from the vgetlib read loop that follows (lost between seqp and the next greater-than sign). */ lline[0]='\0'; while (seqp') goto new; if (*seqp==';') { if (strchr((char*)seqp,'\n')==NULL) goto cont; continue; } for (cp=seqp; seqpNA) break; } if (*seqp==ES) goto done; lpos = ftell(libf); } goto done; new: strncpy((char*)lline,(char*)seqp,MAXLINE); lline[MAXLINE-1]='\0'; if (strchr((char*)seqp,'\n')==NULL) fgets((char*)lline,MAXLINE-strlen((char*)lline),libf); goto done; cont: fgets((char*)lline,MAXLINE,libf); seqm1 = seqp; done: if (seqp>=seqm1) { (*lcont)++; } else { *lcont=0; } *seqp = EOSEQ; if ((int)(seqp-seq)==0) return 1; return (int)(seqp-seq); } vranlib(str,cnt,seek) char *str; int cnt; long seek; { char *bp; int ll; fseek(libf, seek, 0); fgets((char*)lline,MAXLINE,libf); if (lline[0]=='>'&&(lline[3]==';'||lline[3]=='>')) { strncpy(str,(char*)&lline[4],cnt); if ((bp = strchr(str,':'))!=NULL) *bp='\0'; if ((bp=strchr(str,'\r'))!=NULL) *bp='\0'; else if ((bp = strchr(str,'\n'))!=NULL) *bp='\0'; else str[cnt-1]='\0'; fgets((char*)lline,MAXLINE,libf); if ((bp=strchr((char*)lline,'\r'))!=NULL) *bp='\0'; if ((bp=strchr((char*)lline,'\n'))!=NULL) *bp='\0'; strncat(str," ",cnt); strncat(str,(char*)lline,cnt-strlen(str)); } else { str[0]='\0'; } fseek(libf,seek,0); fgets((char*)lline,MAXLINE,libf); } #endif /* NOLIB */ scanseq(seq,n,str) char *seq, *str; int n; { int tot,i; char aaray[MAXSQ]; /* this must be set > nsq */ for (i=0; i