/* A Program for Identifying Regions Satisfying a Content Requirement Copyright (c) 1992 Xiaoqiu Huang All Rights Reserved. Permission to use, copy, modify, and distribute this software and its documentation for educational, research and non-profit purposes, without fee, and without a written agreement is hereby granted, provided that the above copyright notice, this paragraph and the following three paragraphs appear in all copies. Permission to incorporate this software into commercial products may be obtained from the Intellectual Property Office, 1400 Townsend Drive, 301 Administration Building, Houghton, MI 49931, phone (906) 487-3429. IN NO EVENT SHALL THE AUTHOR OR MICHIGAN TECHNOLOGICAL UNIVERSITY BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF MICHIGAN TECHNOLOGICAL UNIVERSITY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. MICHIGAN TECHNOLOGICAL UNIVERSITY SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND MICHIGAN TECHNOLOGICAL UNIVERSITY HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. Proper attribution of the author as the source of the software would be appreciated: Huang, Xiaoqiu An Algorithm for Identifying Regions of a DNA Sequence that Satisfy Given Content Requirements, Computer Applications in the Biosciences, 10(3), 219-225, 1994. Xiaoqiu Huang Department of Computer Science Michigan Technological University Houghton, MI 49931 E-mail: huang@cs.mtu.edu LCP (Local Content Program): The program takes a DNA sequence file, a file containing a content requirement, and a length cutoff (integer), and reports the regions of the sequence that satisfy the content requirement. The sequence file contains a plain DNA sequence in lower or upper case, where no other characters are allowed. The content-requirement file contains a SINGLE linear inequality. The following are examples of linear inequalities: C + G > 0.6 (A + C + G + T) C + G > 0.6 N CG > 0.1 RY Here, N = (A+C+G+T), R = (A+G), Y = (C+T). The first two inequalities are equivalent, requiring identification of regions with a C+G content greater than 60%. The third one is equivalent to CG > 0.1 (AC+AT+GC+GT), specifying regions in which the ratio of CG count to AC, AT, GC and GT count is greater than 0.1. The program makes no distinction between upper and lower case letters in an inequality. The eligible letters are A, C, G, T, N, R, and Y. The program is written in C and runs on Sun's and PC's. For description of the algorithm and implementation, see "An algorithm for identifying regions of a DNA sequence that satisfying a content requirement", to appear in CABIOS. Xiaoqiu Huang Department of Computer Science Michigan Technological University Houghton, MI 49931 */ #include #define PLENGTH 6 /* the maximum length of any oligomer */ typedef struct oligo { float num; /* coefficient of an oligomer */ char mer[PLENGTH+2];/* array for holding an oligomer */ struct oligo *next; /* pointer to the next */ } node, *nodeptr; int attr; /* attribute of the next id: 0, null;1, num;2, let;3,(;4,);5,+ */ char nch; /* the next symbol */ float nnum; /* the next number */ char *sym; /* pointer to the next char */ /* A recursive-decent parser for recognizing an expression. Grammar: E -> T | T + E T -> F | FT F -> | | (E) */ main(argc, argv) int argc; char *argv[]; { int M; /* Sequence length */ char *A; /* Storing a sequence */ int symbol; /* The next character */ FILE *Ap, *ckopen(); /* File pointer */ char *ckalloc(); /* space-allocating function */ char *name; /* the name of sequence file */ int i, j, k; /* index variables */ float **weight; /* weight matrix */ int minlen; /* minimum length of any region reported */ float score; /* score of the current position */ int *size; /* size of weight vectors */ int *order; /* order[i] = size[i] / 5 */ int *tlen; /* length of oligomers */ int *value; /* integer value of an oligomer*/ int vert[128]; /* converted digits for 'A','C','G','T' */ int num_tm; /* number of terms in an inequality */ int temp; /* temporary varibles */ char *string; /* constraint expression */ nodeptr list1; /* pointer to a list of positive oligos */ nodeptr list2; /* pointer to a list of negative oligos */ char ine; /* An inequality sign < or > */ int sec; /* position of the expression after >/<. */ nodeptr x; /* pointer */ nodeptr expr(); /* a function to recognize an expression */ int olen[PLENGTH+1]; /* lengths of oligos */ int ser[PLENGTH+1]; /* serial number of an oligo */ float cons; /* total of constant terms in the inequality */ float *S; /* The scoring vector from left */ float *U; /* The scoring vector from right */ int q, r; /* separation positions */ int b, e; /* A[b,e] is a strictly optimal region */ int left, right; for ( i = 0; i < 128; i++ ) vert[i] = 4; vert['A'] = vert['a'] = 0; vert['C'] = vert['c'] = 1; vert['G'] = vert['g'] = 2; vert['T'] = vert['t'] = 3; ine = '='; if ( argc != 4 ) fatalf("The proper form of command is:\n%s sequence_file inequality_file min_length", argv[0]); if ( sscanf(argv[3],"%d", &minlen) != 1 ) fatal("Minimum length integer is missing"); if ( minlen <= 0 ) fatal("The minimum length must be a positive integer"); /* determine the sequence length */ Ap = ckopen(argv[1], "r"); for (M = 0; ( symbol = getc(Ap)) != EOF ; ) if ( symbol != '\n' ) ++M; (void) fclose(Ap); name = argv[1]; /* allocate space for A, S and U */ A = ( char * ) ckalloc( (M + 1) * sizeof(char)); S = ( float * ) ckalloc( (M + 2) * sizeof(float)); U = ( float * ) ckalloc( (M + 2) * sizeof(float)); /* read the sequence into A */ Ap = ckopen(argv[1], "r"); for (M = 0; ( symbol = getc(Ap)) != EOF ; ) if ( symbol != '\n' ) A[++M] = symbol; (void) fclose(Ap); printf("Sequence Name: %s\n", name); printf("Sequence Length: %d\n", M); printf("Region Length Cutoff: %d\n", minlen); printf("Content Requirement:\n"); /* determine the length of the inequality */ Ap = ckopen(argv[2], "r"); for ( k = 0; ( symbol = getc(Ap)) != EOF ; ) { printf("%c", symbol); if ( symbol != ' ' && symbol != '\n' ) switch ( symbol ) { case 'N': k += 9; break; case 'R': case 'Y': k += 5; break; default : k++; break; } } (void) fclose(Ap); string = (char *) ckalloc( (k + 1) * sizeof(char)); /* read the inequality into string one at a time */ Ap = ckopen(argv[2], "r"); for (k = 0; ( symbol = getc(Ap)) != EOF ; ) if ( symbol != ' ' && symbol != '\n' ) switch ( symbol ) { case 'N': sprintf(string+k, "(A+C+G+T)"); k += 9; break; case 'R': sprintf(string+k, "(A+G)"); k += 5; break; case 'Y': sprintf(string+k, "(C+T)"); k += 5; break; case '>': case '<': if ( ine != '=' ) fatal("Can not have more than one occurrence of < or >."); ine = symbol; string[k++] = '\0'; sec = k; break; default : string[k++] = symbol; break; } string[k] = '\0'; k = 0; if ( ine == '=' ) fatal("There is no occurrence of > or <"); sym = ine == '>' ? string : (string + sec); nextid(); list1 = expr(); if ( attr != 0 ) fatal("A left parenthesis may be missing"); sym = ine == '<' ? string : (string + sec); ine = '='; nextid(); list2 = expr(); if ( attr != 0 ) fatal("A left parenthesis may be missing"); for ( i = 0; i <= PLENGTH; i++ ) olen[i] = 0; for ( x = list1; x != NULL; x = x->next ) { for ( i = 0; x->mer[i]; i++ ) ; olen[i] = 1; } for ( x = list2; x != NULL; x = x->next ) { for ( i = 0; x->mer[i]; i++ ) ; olen[i] = 1; } for ( num_tm = 0, i = 1; i <= PLENGTH; i++ ) if ( olen[i] ) num_tm++; weight = ( float ** ) ckalloc( (num_tm) * sizeof(float *)); size = ( int * ) ckalloc( (num_tm) * sizeof(int)); order = ( int * ) ckalloc( (num_tm) * sizeof(int)); tlen = ( int * ) ckalloc( (num_tm) * sizeof(int)); value = ( int * ) ckalloc( (num_tm) * sizeof(int)); for ( j = 0, i = 1; i <= PLENGTH; i++ ) if ( olen[i] ) { ser[i] = j; tlen[j++] = i; } for ( i = 0; i < num_tm; i++ ) { for ( size[i] = 1, j = 1; j <= tlen[i]; j++) size[i] *= 5; order[i] = size[i] / 5; weight[i] = (float *) ckalloc(size[i] * sizeof(float)); for ( j = 0; j < size[i]; j++ ) weight[i][j] = 0.0; } cons = 0.0; for ( x = list1; x != NULL; x = x->next ) if ( x->mer[0] ) { for ( temp = j = 0; x->mer[j]; j++ ) temp = temp * 5 + vert[x->mer[j]]; weight[ser[j]][temp] += x->num; } else cons += x->num; for ( x = list2; x != NULL; x = x->next ) if ( x->mer[0] ) { for ( temp = j = 0; x->mer[j]; j++ ) temp = temp * 5 + vert[x->mer[j]]; weight[ser[j]][temp] -= x->num; } else cons -= x->num; /* scan the DNA sequence */ printf(" Start End Length Score\n"); for ( temp = i = 0; i < num_tm; i++ ) { value[i] = 0; if ( temp < tlen[i] ) temp = tlen[i]; } if ( temp > M ) fatal("The sequence is too short"); score = 0.0; for ( i = 1; i <= M; i++ ) { for ( j = 0; j < num_tm; j++ ) { value[j] = ( value[j] * 5 + vert[A[i]] ) % size[j]; if ( i >= tlen[j] ) score += weight[j][value[j]]; } S[i] = score; if ( score < 0.0 ) score = 0.0; } for ( i = 0; i < temp; i++ ) S[i] = -1; S[M+1] = -1; for ( i = 0; i < num_tm; i++ ) value[i] = 0; score = 0.0; for ( i = M; i >= 1; i-- ) { k = M-i+1; for ( j = 0; j < num_tm; j++ ) { if ( k > tlen[j] ) value[j] = value[j] - vert[A[i+tlen[j]]]; value[j] = value[j] / 5 + vert[A[i]] * order[j]; if ( k >= tlen[j] ) score += weight[j][value[j]]; } U[i] = score; if ( score < 0.0 ) score = 0.0; } for ( i = 0; i < temp; i++ ) U[M+1-i] = -1; U[0] = -1; for ( q = temp-1; q < M+1; ) { for ( r = q-temp+2; S[r+temp-1] >= 0.0 && U[r] >= 0.0; r++ ) ; for ( b = q+1; b < r+temp-1 && S[b-1] > 0.0 ; b++ ) ; if ( b < r+temp-1 ) { for ( e = r-1; e+temp >= b+1 && U[e+1] > 0.0; e-- ) ; if ( e+temp >= b+1 ) { left = b - temp + 1; right = e + temp - 1; if ( right-left+1 >= minlen && S[e]+cons > 0.0 ) printf("%7d %7d %7d %11.2f\n", left, right, right-left+1, S[e]); } } q = r+temp-1; } } nextid() { char digit[100]; /* a sequence of digits */ char *a; if ( *sym == '\0' ) attr = 0; else if ( *sym >= 'A' && *sym <= 'Z' || *sym >= 'a' && *sym <= 'z' ) { nch = *sym++; attr = 2; } else if ( *sym >= '0' && * sym <= '9' || *sym == '.' ) { a = digit; *a++ = *sym++; while ( *sym >= '0' && *sym <= '9' || *sym == '.' ) *a++ = *sym++; *a++ = '\0'; if ( sscanf(digit, "%f", &nnum) != 1 ) fatal("An illegal number"); attr = 1; } else if ( *sym == '(' || *sym == ')' ) { attr = *sym == '(' ? 3 : 4; nch = *sym++; } else if ( *sym == '+' ) { attr = 5; nch = *sym++; } else fatal("An illegal character"); } /* This function returns a linked list of oligo nodes specified by an expression. */ nodeptr expr() { nodeptr t; /* pointer to a list generated by term() */ nodeptr e; /* pointer to a list generated by expr() */ nodeptr x; /* temporary variable */ nodeptr term(); t = term(); if ( attr == 5 ) { nextid(); e = expr(); for ( x = t; x->next != NULL; x = x->next ) ; x->next = e; } return t; } /* This function returns a list derived by a term */ nodeptr term() { nodeptr f; /* pointer to a list generated by factor() */ nodeptr t; /* pointer to a list generated by term() */ nodeptr x, y; /* temporary variables */ nodeptr head; /* pointer to a list from the concatenation of f and t. */ nodeptr temp; /* temporary variable */ char *ckalloc(); /* space-allocation function */ char *a, *b; /* temporary variables */ int i; nodeptr factor(); f = factor(); if ( attr <= 3 && attr >= 1 ) { t = term(); head = NULL; for ( x = f; x != NULL; x = x->next ) for ( y = t; y != NULL; y = y->next ) { temp = (nodeptr) ckalloc(sizeof(node)); temp->num = x->num * y->num; for ( i = 0, a = temp->mer, b = x->mer; *a++ = *b++; i++) ; a--; for ( b = y->mer; *a++ = *b++; ) if ( ++i > PLENGTH ) fatal("An oligomer is too long"); temp->next = head; head = temp; } for ( x = f; x != NULL; x = y ) { y = x->next; free(x); } for ( x = t; x != NULL; x = y ) { y = x->next; free(x); } f = head; } return f; } /* Returns a node representing a number or a letter */ nodeptr factor() { nodeptr x; /* pointer to a node containing an id */ char *ckalloc(); if ( attr == 1 ) { x = (nodeptr) ckalloc(sizeof(node)); x->num = nnum; x->mer[0] = '\0'; x->next = NULL; nextid(); } else if ( attr == 2 ) { x = (nodeptr) ckalloc(sizeof(node)); x->num = 1; x->mer[0] = nch; x->mer[1] = '\0'; x->next = NULL; nextid(); } else if ( attr == 3 ) { nextid(); x = expr(); if ( attr == 4 ) nextid(); else fatal("A right parenthesis is missing"); } else fatal("Syntax error"); return x; } /* lib.c - library of C procedures. */ /* fatal - print message and die */ fatal(msg) char *msg; { (void) fprintf(stderr, "%s\n", msg); exit(1); } /* fatalf - format message, print it, and die */ fatalf(msg, val) char *msg, *val; { (void) fprintf(stderr, msg, val); (void) putc('\n', stderr); exit(1); } /* ckopen - open file; check for success */ FILE *ckopen(name, mode) char *name, *mode; { FILE *fopen(), *fp; if ((fp = fopen(name, mode)) == NULL) fatalf("Cannot open %s.", name); return(fp); } /* ckalloc - allocate space; check for success */ char *ckalloc(amount) int amount; { char *malloc(), *p; if ((p = malloc( (unsigned) amount)) == NULL) fatal("Ran out of memory."); return(p); }