#! /bin/sh
# This is a shell archive, meaning:
# 1. Remove everything above the #! /bin/sh line.
# 2. Save the resulting text in a file.
# 3. Execute the file with /bin/sh (not csh) to create the files:
#	readseqd
# This archive created: Tue Feb  2 11:15:15 1993
export PATH; PATH=/bin:$PATH
if test ! -d 'readseqd'
then
	echo shar: creating directory "'readseqd'"
	mkdir 'readseqd'
fi
echo shar: entering directory "'readseqd'"
cd 'readseqd'
echo shar: extracting "'readseq.c'" '(35592 characters)'
if test -f 'readseq.c'
then
	echo shar: will not over-write existing file "'readseq.c'"
else
cat << \SHAR_EOF > 'readseq.c'
/* File: readseq.c
 * main() program for ureadseq.c, ureadseq.h
 *
 * Reads and writes nucleic/protein sequence in various
 * formats. Data files may have multiple sequences.
 *
 * Copyright 1990 by d.g.gilbert
 * biology dept., indiana university, bloomington, in 47405
 * e-mail: gilbertd@bio.indiana.edu
 *
 * This program may be freely copied and used by anyone.
 * Developers are encouraged to incorporate parts in their
 * programs, rather than devise their own private sequence
 * format.
 *
 * This should compile and run with any ANSI C compiler.
 * Please advise me of any bugs, additions or corrections.
 *
 */

/* Program banner.  usage() prints it as the first line of the help text, and
 * main() prints it to stderr for verbose runs or interactive runs that were
 * started without an input file.
 */
const char *title
    = "readSeq (1Feb93), multi-format molbio sequence reader.\n";

 /*  History
  27 Feb 90.  1st release to public.
   4 Mar 90.  + Gary Olsen format
              + case change
              * minor corrections to NBRF,EMBL,others
              * output 1 file per sequence for gcg, unknown
              * define -DNOSTR for c-libraries w/o strstr
              - readseq.p, pascal version, becomes out-of-date
  24 May 90.  + Phylip 3.2 output format (no input)
  20 Jul 90.  + Phylip 3.3 output (no input yet)
              + interactive output re-direction
              + verbose progress info
              * interactive help output
              * dropped line no.s on NBRF output
              * patched in HyperGCG XCMD corrections,
                - except for seq. documentation handling
              * dropped the IG special nuc codes, as IG has
                adopted the standard IUB codes (now if only
                everyone would adopt a standard format !)
  11 Oct 90.  * corrected bug in reading/writing of EMBL format

  17 Oct 91.  * corrected bug in reading Olsen format
                (serious-deletion)
  10 Nov 91.  * corrected bug in reading some GCG format files
                (serious-last line duplicated)
              + add format name parsing (-fgb, -ffasta, ...)
              + Phylip v3.4 output format (== v3.2, sequential)
              + add checksum output to all forms that have document
              + skip mail headers in seq file
              + add pipe for standard input == seq file (with -p)
              * fold in parts of MacApp Seq object
              * strengthen format detection
              * clarify program structure
              * remove fixed sequence size limit (now dynamic, sizeof memory)
              * check and fold in accumulated bug reports:
              *   Now ANSI-C fopen(..,"w") & check open failure
              *   Define -DFIXTOUPPER for nonANSI C libraries that mess
                  up toupper/tolower
              = No command-line changes; callers of readseq main() should be okay
              - ureadseq.h functions have changed; client programs need to note.
              + added Unix and VMS Make scripts, including validation tests

   4 May 92.  + added 32 bit CRC checksum as alternative to GCG 6.5bit checksum
                (-DBIGCHECKSUM)
    Aug 92    = fixed Olsen format input to handle files w/ more sequences,
                not to mess up when more than one seq has same identifier,
                and to convert number masks to symbols.
              = IG format fix to understand ^L

  25-30 Dec 92
              * revised command-line & interactive interface.  Suggested form is now
                  readseq infile -format=genbank -output=outfile -item=1,3,4 ...
                but remains compatible with prior commandlines:
                  readseq infile -f2 -ooutfile -i3 ...
              + added GCG MSF multi sequence file format
              + added PIR/CODATA format
              + added NCBI ASN.1 sequence file format
              + added Pretty, multi sequence pretty output (only)
              + added PAUP multi seq format
              + added degap option
              + added Gary Williams (GWW, G.Williams@CRC.AC.UK) reverse-complement option.
              + added support for reading Phylip formats (interleave & sequential)
              * string fixes, dropped need for compiler flags NOSTR, FIXTOUPPER, NEEDSTRCASECMP
              * changed 32bit checksum to default, -DSMALLCHECKSUM for GCG version

   1Feb93
              = revert GenBank output to a fixed left number width which 
               other software depends on.
	      = fix for MSF input to handle symbols in names
	      = fix bug for possible memory overrun when truncating seqs for
		Phylip or Paup formats (thanks Anthony Persechini)

 */



/*
   Readseq has been tested with:
      Macintosh MPW C
      GNU gcc
      SGI cc
      VAX-VMS cc
   Any ANSI C compiler should be able to handle this.
   Old-style C compilers barf all over the source.


How do I build the readseq program if I have an Ansi C compiler?
#--------------------
# Unix ANSI C
# Use the supplied Makefile this way:
%  make CC=name-of-c-compiler
# OR do this...
% gcc readseq.c ureadseq.c -o readseq

#--------------------
$!VAX-VMS cc
$! Use the supplied Make.Com this way:
$  @make
$! OR, do this:
$ cc readseq, ureadseq
$ link readseq, ureadseq, sys$library:vaxcrtl/lib
$ readseq :== $ MyDisk:[myacct]readseq

#--------------------
# Macintosh Simple Input/Output Window application
# requires MPW-C and SIOW library (from APDA)
# also uses files macinit.c, macinit.r, readseqSIOW.make
#
Buildprogram readseqSIOW

#--------------------
#MPW-C v3 tool
C  ureadseq.c
C  readseq.c
link -w -o readseq -t MPST -c 'MPS ' ¶
   readseq.c.o Ureadseq.c.o ¶
    "{Libraries}"Interface.o ¶
    "{Libraries}"ToolLibs.o ¶
    "{Libraries}"Runtime.o ¶
    "{CLibraries}"StdClib.o
readseq -i1 ig.seq

# MPW-C with NCBI tools

set NCBI "{Boot}@molbio:ncbi:"; EXPORT NCBI
set NCBILIB1  "{NCBI}"lib:libncbi.o; export NCBILIB1
set NCBILIB2  "{NCBI}"lib:libncbiobj.o; export NCBILIB2
set NCBILIB3  "{NCBI}"lib:libncbicdr.o; export NCBILIB3
set NCBILIB4  "{NCBI}"lib:libvibrant.o; export NCBILIB4

C  ureadseq.c
C  -d NCBI -i "{NCBI}"include: ureadasn.c
C  -d NCBI -i "{NCBI}"include: readseq.c
link -w -o readseq -t MPST -c 'MPS ' ¶
   ureadseq.c.o ureadasn.c.o readseq.c.o  ¶
    {NCBILIB4} {NCBILIB2} {NCBILIB1} ¶
    "{Libraries}"Interface.o ¶
    "{Libraries}"ToolLibs.o ¶
    "{Libraries}"Runtime.o ¶
    "{CLibraries}"CSANELib.o ¶
    "{CLibraries}"Math.o ¶
    "{CLibraries}"StdClib.o

===========================================================*/



#include <stdio.h>
#include <string.h>
#include <ctype.h>

#include "ureadseq.h"

#pragma segment readseq



/* Name of the input file currently being processed.  inputfile normally
 * addresses inputfilestore; main() re-points it at its temporary-file name
 * when standard input is piped in (-pipe without an input file).
 */
static char inputfilestore[256], *inputfile = inputfilestore;

/* Display names of the numbered sequence formats.  Entry [n-1] describes
 * format number n (see formatstr()); the final empty string fills the extra
 * slot of the kMaxFormat+1 sized array.  usage() and chooseFormat() print
 * this table in two columns.
 */
const char *formats[kMaxFormat+1] = {
    " 1. IG/Stanford",
    " 2. GenBank/GB",
    " 3. NBRF",
    " 4. EMBL",
    " 5. GCG",
    " 6. DNAStrider",
    " 7. Fitch",
    " 8. Pearson/Fasta",
    " 9. Zuker (in-only)",
    "10. Olsen (in-only)",
    "11. Phylip3.2",
    "12. Phylip",
    "13. Plain/Raw",
    "14. PIR/CODATA",
    "15. MSF",
    "16. ASN.1",
    "17. PAUP/NEXUS",
    "18. Pretty (out-only)",
    "" };

/* kFormCount is the number of entries parseformat() scans in formname[];
 * it must be kept equal to the number of initializers below (30).
 * kMaxFormName is the most characters of a user-supplied name that
 * parseformat() lower-cases and compares.
 */
#define kFormCount  30
#define kMaxFormName 15

/* Lower-case format names (and aliases) accepted by -format=Name, each paired
 * with the format code it selects.  Several names may map to the same code.
 */
const  struct formatTable {
  char  *name;   /* lower-case name as typed by the user */
  short num;     /* format code returned by parseformat() */
  } formname[] = {
    {"ig",  kIG},
    {"stanford", kIG},
    {"genbank", kGenBank},
    {"gb", kGenBank},
    {"nbrf", kNBRF},
    {"embl", kEMBL},
    {"gcg", kGCG},
    {"uwgcg", kGCG},
    {"dnastrider", kStrider},
    {"strider", kStrider},
    {"fitch", kFitch},
    {"pearson", kPearson},
    {"fasta", kPearson},
    {"zuker", kZuker},
    {"olsen", kOlsen},
    {"phylip", kPhylip},
    {"phylip3.2", kPhylip2},
    {"phylip3.3", kPhylip3},
    {"phylip3.4", kPhylip4},
    {"phylip-interleaved", kPhylip4},
    {"phylip-sequential", kPhylip2},
    {"plain", kPlain},
    {"raw", kPlain},
    {"pir", kPIR},
    {"codata", kPIR},
    {"asn.1", kASN1},
    {"msf", kMSF},
    {"paup", kPAUP},
    {"nexus", kPAUP},
    {"pretty", kPretty},
  };

/* Header text main() writes ahead of the first sequence of ASN.1 output. */
const char *kASN1headline = "Bioseq-set ::= {\nseq-set {\n";

/* GWW table for getting the complement of a nucleotide (IUB codes) */
/* The table is indexed by (character - ' '): main() looks up compl[seq[j] - ' ']
   when reverse-complementing.  The comment line below shows the input
   character that corresponds to each table position. */
/*                     ! "#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[ \]^_`abcdefghijklmnopqrstuvwxyz{|}~ */
const char compl[] = " !\"#$%&'()*+,-./0123456789:;<=>?@TVGHNNCDNNMNKNNYRYSAABWNRN[\\]^_`tvghnncdnnmnknnyrysaabwnrn{|}~";



/* formatstr: display name for a format code.
 *   format  - a numbered format (1..kMaxFormat) or one of the special
 *             ASN.1 / Phylip sub-format codes
 *   returns - the matching entry of formats[], or "(unknown)" when the code
 *             is not recognised.  The string must not be modified.
 */
char *formatstr( short format)
{
  /* ordinary numbered formats map straight onto the table */
  if (format >= 1 && format <= kMaxFormat)
    return (char *) formats[format-1];

  /* sub-format codes outside the numbered range share their parent's name */
  if (format == kASNseqentry || format == kASNseqset)
    return (char *) formats[kASN1-1];
  if (format == kPhylipInterleave || format == kPhylipSequential)
    return (char *) formats[kPhylip-1];

  return "(unknown)";
}

/* parseformat: translate a user-supplied format specifier into a format code.
 *   name    - text typed by the user: either a format number or a (possibly
 *             abbreviated) format name; leading white space is passed through
 *             skipwhitespace() first
 *   returns - the format code, or kNoformat when the text is empty, the
 *             number is outside kMinFormat..kMaxFormat, no name matches, or
 *             an abbreviation matches more than one table entry
 *
 * Names are compared case-insensitively against formname[], looking at no
 * more than kMaxFormName characters.  A table entry whose length equals the
 * typed length wins immediately; otherwise a single unambiguous prefix
 * match is accepted.
 */
int parseformat( char *name)
{
#define kDupmatch  (-2)
  int   namelen, maxlen, i, matchat;
  char  lname[kMaxFormName+1];

  skipwhitespace(name);
  namelen = (int) strlen(name);
  if (namelen == 0)
    return kNoformat;

  /* numeric form: "-f2", "-format=12".
     The cast keeps isdigit() defined for chars with the high bit set. */
  if (isdigit((unsigned char) *name)) {
    i = (int) atol( name);
    if (i < kMinFormat || i > kMaxFormat) return kNoformat;
    return i;
    }

  /* else match character name: build a lower-case, length-limited copy */
  maxlen = min( kMaxFormName, namelen);
  for (i=0; i<maxlen; i++) lname[i] = to_lower(name[i]);
  lname[maxlen]=0;
  matchat = kNoformat;

  for (i=0; i<kFormCount; i++) {
    if (strncmp( lname, formname[i].name, maxlen) != 0)
      continue;
    /* same length as the table entry: exact match, take it at once */
    if ((int) strlen(formname[i].name) == namelen)
      return (formname[i].num);
    if (matchat == kNoformat) matchat = i;   /* first partial match */
    else matchat = kDupmatch;                /* 2 or more partial matches */
    }

  if (matchat == kNoformat || matchat == kDupmatch)
    return kNoformat;
  return formname[matchat].num;
}



/* dumpSeqList: print the sequence listing of the current input file.
 *   list   - NUL-terminated text holding one entry per NEWLINE-terminated line
 *   format - format code of the input file, shown in the heading
 *
 * A heading naming inputfile and the format goes to stdout, followed by each
 * complete line of the list and one blank line.  Lines are cut to 255
 * characters; text after the last NEWLINE is not printed.
 */
static void dumpSeqList(char *list, short format)
{
  char  line[256];
  char  *cp, *end;
  long  used = 0;

  end = list + strlen(list);
  printf("Sequences in %s  (format is %s)\n", inputfile, formatstr(format));

  for (cp = list; cp < end; cp++) {
    if (*cp != (char)NEWLINE) {
      /* collect the character unless the line buffer is already full */
      if (used < 255) line[used++] = *cp;
      continue;
      }
    /* end of an entry: emit what was collected and start over */
    line[used] = '\0';
    used = 0;
    puts(line);
    }
  putchar('\n');
}



/* usage: write the command-line help to stderr.
 *
 * Prints the program banner, the general options, the numbered format table
 * (two columns taken from formats[]) and the Pretty-format options.
 * All text is emitted with fputs() so that no help string is ever
 * interpreted as a printf format.
 */
void usage()
{
/* ? add -d[igits] to allow digits in sequence data, &/or option to specify seq charset !? */
  static const char *const optlines[] = {
    "usage: readseq [-options] in.seq > out.seq\n",
    " options\n",
    "    -a[ll]         select All sequences\n",
    "    -c[aselower]   change to lower case\n",
    "    -C[ASEUPPER]   change to UPPER CASE\n",
    "    -degap[=-]     remove gap symbols\n",
    "    -i[tem=2,3,4]  select Item number(s) from several\n",
    "    -l[ist]        List sequences only\n",
    "    -o[utput=]out.seq  redirect Output\n",
    "    -p[ipe]        Pipe (command line, <stdin, >stdout)\n",
    "    -r[everse]     change to Reverse-complement\n",
    "    -v[erbose]     Verbose progress\n",
    "    -f[ormat=]#    Format number for output,  or\n",
    "    -f[ormat=]Name Format name for output:\n"
    };

  /* new output format options, esp. for pretty format: */
  static const char *const prettylines[] = {
    "     \n",
    "   Pretty format options: \n",
    "    -wid[th]=#            sequence line width\n",
    "    -tab=#                left indent\n",
    "    -col[space]=#         column space within sequence line on output\n",
    "    -gap[count]           count gap chars in sequence numbers\n",
    "    -nameleft, -nameright[=#]   name on left/right side [=max width]\n",
    "    -nametop              name at top/bottom\n",
    "    -numleft, -numright   seq index on left/right side\n",
    "    -numtop, -numbot      index on top/bottom\n",
    "    -match[=.]            use match base for 2..n species\n",
    "    -inter[line=#]        blank line(s) between sequence blocks\n"
    };

  short   i, midi;
  short   nopt    = (short) (sizeof(optlines) / sizeof(optlines[0]));
  short   npretty = (short) (sizeof(prettylines) / sizeof(prettylines[0]));

  fputs(title, stderr);
  for (i = 0; i < nopt; i++)
    fputs(optlines[i], stderr);

  /* format table: left column is the first half, right column the second */
  midi = (kMaxFormat+1) / 2;
  for (i = kMinFormat-1; i < midi; i++)
   fprintf( stderr, "        %-20s      %-20s\n",
    formats[i], formats[midi+i]);

  for (i = 0; i < npretty; i++)
    fputs(prettylines[i], stderr);

  /******  not ready yet
  fprintf(stderr, "    -code=none,rtf,postscript,ps   code syntax\n");
  fprintf(stderr, "    -namefont=, -numfont=, -seqfont=font   font choice\n");
  fprintf(stderr, "       font suggestions include times,courier,helvetica\n");
  fprintf(stderr, "    -namefontsize=, -numfontsize=, -seqfontsize=#\n");
  fprintf(stderr, "       fontsize suggestions include 9,10,12,14\n");
  fprintf(stderr, "    -namefontstyle=, -numfontstyle=, -seqfontstyle= style  fontstyle for names\n");
  fprintf(stderr, "       fontstyle options are plain,italic,bold,bold-italic\n");
  ******/
}

/* erralert: report an error code to the user on stderr.
 *   err - 0 (nothing is printed) or one of the e... error codes; a code
 *         without a message of its own is reported numerically.
 */
void erralert(short err)
{
  const char *msg;

  if (err == 0)
    return;

  switch (err) {
    /* the two messages that carry a value are printed on the spot */
    case eFileNotFound:
      fprintf(stderr, "File not found: %s\n", inputfile);
      return;
    default:
      fprintf(stderr, "readSeq error = %d\n", err);
      return;

    /* fixed-text messages */
    case eFileCreate:
      msg = "Can't open output file.\n";
      break;
    case eASNerr:
      msg = "Error in ASN.1 sequence routines.\n";
      break;
    case eNoData:
      msg = "No data in file.\n";
      break;
    case eItemNotFound:
      msg = "Specified item not in file.\n";
      break;
    case eUnequalSize:
      msg = "This format requires equal length sequences.\nSequence truncated or padded to fit.\n";
      break;
    case eUnknownFormat:
      msg = "Error: this format is unknown to me.\n";
      break;
    case eOneFormat:
      msg = "Warning: This format permits only 1 sequence per file.\n";
      break;
    case eMemFull:
      msg = "Out of storage memory. Sequence truncated.\n";
      break;
    }

  fputs(msg, stderr);
} /* erralert */


/* chooseFormat: decide the output format when none was given as an option.
 *   quietly - when true, do not prompt; the default format is returned
 *   returns - the chosen format code; kPearson is the default, also used
 *             when the reply is empty, unreadable or not a known format
 *
 * In interactive use the format table is shown on stderr and one line is
 * read from stdin.  The reply is read with a bounded fgets(), so an
 * over-long line cannot overrun the buffer (the excess is left unread),
 * and end-of-file is treated as an empty reply.
 */
int chooseFormat( boolean quietly)
{
  char  sform[128];
  int   midi, i, outform;

  if (quietly)
    return kPearson;  /* default */

  /* two-column format table, as in usage() */
  midi = (kMaxFormat+1) / 2;
  for (i = kMinFormat-1; i < midi; i++)
    fprintf( stderr, "        %-20s      %-20s\n",
                    formats[i], formats[midi+i]);
  fprintf(stderr,"\nChoose an output format (name or #): \n");

  if (fgets(sform, sizeof(sform), stdin) == NULL)
    sform[0] = '\0';                      /* EOF or read error: no reply */
  else
    sform[strcspn(sform, "\n")] = '\0';   /* drop the newline fgets keeps */

  outform = parseformat(sform);
  if (outform == kNoformat) outform = kPearson;
  return outform;
}



/* read parameter(s) */

/* checkopt: does the typed option match an option keyword?
 *   casesense - true to compare with strncmp, false to ignore case
 *   sopt      - option text as typed (any "=value" part already cut off)
 *   smatch    - full option keyword, e.g. "-verbose"
 *   minword   - fewest leading characters that must agree
 *   returns   - true when the leading characters agree
 *
 * The number of characters compared is the shorter of the two lengths, but
 * never fewer than minword, so an abbreviation shorter than minword fails.
 */
boolean checkopt( boolean casesense, char *sopt, const char *smatch, short minword)
{
  long  optlen   = strlen(sopt);
  long  matchlen = strlen(smatch);
  short ncompare;

  ncompare = max(minword, min(optlen, matchlen));

  if (!casesense)
    return (!Strncasecmp( sopt, smatch, ncompare ));
  return (!strncmp( sopt, smatch, ncompare));
}


#define   kMaxwhichlist  50   /* most item numbers readopt() keeps from one -item option */

/* global for readopt(), main() */
/* Option state: readopt() sets these from the command line or interactive
   replies, and resetGlobals() restores the defaults shown here. */
boolean   chooseall = false, quietly = false, gotinputfile = false,
          listonly = false, closeout = false, verbose = false,
          manyout = false, dolower = false, doupper = false, doreverse= false,
          askout  = true, dopipe= false, interleaved = false;
/* nfile counts input file names seen by readopt(); main() counts it back
   down as it takes each one up.  nwhichlist is the number of entries stored
   in whichlist[], iwhichlist the next entry to use. */
short     nfile = 0, iwhichlist=0, nwhichlist = 0;
/* item numbers from -item, ended by a 0 stop sign (hence the +1) */
short     whichlist[kMaxwhichlist+1];
/* whichSeq: number of the sequence to process (0 = none chosen yet);
   outform: output format code, kNoformat until one is chosen */
long      whichSeq = 0, outform = kNoformat;
/* output file name; an empty name means no output file was named */
char      onamestore[128], *oname = onamestore;
/* output stream; main() sets it to stdout before options are read */
FILE      *foo = NULL;

/* resetGlobals: put every option global back to its start-up default.
 * Needed under SIOW, where main() can be entered more than once and the
 * globals are not re-initialised automatically between calls.
 */
void resetGlobals()
{
  /* sequence selection */
  chooseall   = false;
  whichSeq    = 0;
  iwhichlist  = 0;
  nwhichlist  = 0;

  /* input */
  gotinputfile = false;
  nfile       = 0;
  dopipe      = false;

  /* output */
  outform     = kNoformat;
  oname       = onamestore;
  foo         = NULL;
  closeout    = false;
  askout      = true;
  manyout     = false;
  interleaved = false;

  /* behaviour switches */
  quietly     = false;
  listonly    = false;
  verbose     = false;
  dolower     = false;
  doupper     = false;
  doreverse   = false;

  gPrettyInit(gPretty);
}


#define kOptOkay  1
#define kOptNone  0

int readopt( char *sopt)
{
  char    sparamstore[256], *sparam= sparamstore;
  short   n, slen= strlen(sopt);

  /* fprintf(stderr,"readopt( %s) == ", sopt); */

  if (*sopt == '?') {
    usage();
    return kOptNone;   /*? eOptionBad or kOptNone */
    }

  else if (*sopt == '-') {

    char *cp= strchr(sopt,'=');
    *sparam= '\0';
    if (cp) {
      strcpy(sparam, cp+1);
      *cp= 0;
      }

    if (checkopt( false, sopt, "-help", 2)) {
      usage();
      return kOptNone;
      }

    if (checkopt( false, sopt, "-all", 2)) {
      whichSeq= 1; chooseall= true;
      return kOptOkay;
      }

    if (checkopt( false, sopt, "-colspace", 4)) { /* test before -c[ase] */
      n= atoi( sparam);
      gPretty.spacer = n;
      return kOptOkay;
      }

    if (checkopt( true, sopt, "-caselower", 2)) {
      dolower= true;
      return kOptOkay;
      }
    if (checkopt( true, sopt, "-CASEUPPER", 2)) {
      doupper= true;
      return kOptOkay;
      }

    if (checkopt( false, sopt, "-pipe", 2)) {
      dopipe= true; askout= false;
      return kOptOkay;
      }

    if (checkopt( false, sopt, "-list", 2)) {
      listonly = true; askout = false;
      return kOptOkay;
      }

    if (checkopt( false, sopt, "-reverse", 2)) {
      doreverse = true;
      return kOptOkay;
      }

    if (checkopt( false, sopt, "-verbose", 2)) {
      verbose = true;
      return kOptOkay;
      }

    if (checkopt( false, sopt, "-match", 5)) {
      gPretty.domatch= true;
      if (*sparam >= ' ') gPretty.matchchar= *sparam;
      return kOptOkay;
      }
    if (checkopt( false, sopt, "-degap", 4)) {
      gPretty.degap= true;
      if (*sparam >= ' ') gPretty.gapchar= *sparam;
      return kOptOkay;
      }

    if (checkopt( false, sopt, "-interline", 4)) {
      gPretty.interline= atoi( sparam);
      return kOptOkay;
      }

    if (checkopt( false, sopt, "-item", 2)) {
      char  *cp = sparam;
      nwhichlist= 0;
      whichlist[0]= 0;
      if (*cp == 0) cp= sopt+2; /* compatible w/ old way */
      do {
        while (*cp!=0 && !isdigit(*cp)) cp++;
        if (*cp!=0) {
          n = atoi( cp);
          whichlist[nwhichlist++]= n;
          while (*cp!=0 && isdigit(*cp)) cp++;
          }
      } while (*cp!=0 && n>0 && nwhichlist<kMaxwhichlist);
      whichlist[nwhichlist++]= 0; /* 0 == stopsign for loop */
      whichSeq= max(1,whichlist[0]); iwhichlist= 1;
      return kOptOkay;
      }

    if (checkopt( false, sopt, "-format", 5)) {/* -format=phylip, -f2, -form=phylip */
      if (*sparam==0) { for (sparam= sopt+2; isalpha(*sparam); sparam++) ; }
      outform = parseformat( sparam);
      return kOptOkay;
      }
    if (checkopt( false, sopt, "-f", 2)) { /* compatible w/ -fphylip prior version */
      if (*sparam==0) sparam= sopt+2;
      outform = parseformat( sparam);
      return kOptOkay;
      }

    if (checkopt( false, sopt, "-output", 3)) {/* -output=myseq */
      if (*sparam==0) { for (sparam= sopt+3; isalpha(*sparam); sparam++) ; }
      strcpy( oname, sparam);
      foo = fopen( oname, "w");
      if (!foo) { erralert(eFileCreate); return eFileCreate; }
      closeout = true;
      askout = false;
      return kOptOkay;
      }
    if (checkopt( false, sopt, "-o", 2)) {  /* compatible w/ -omyseq prior version */
      if (*sparam==0) sparam= sopt+2;
      strcpy( oname, sparam);
      foo = fopen( oname, "w");
      if (!foo) { erralert(eFileCreate); return eFileCreate; }
      closeout = true;
      askout = false;
      return kOptOkay;
      }

    if (checkopt( false, sopt, "-width", 2)) {
      if (*sparam==0) { for (sparam= sopt+2; !isdigit(*sparam) && *sparam!=0; sparam++) ; }
      n= atoi( sparam);
      if (n>0) gPretty.seqwidth = n;
      return kOptOkay;
      }

    if (checkopt( false, sopt, "-tab", 4)) {
      if (*sparam==0) { for (sparam= sopt+2; !isdigit(*sparam) && *sparam!=0; sparam++) ; }
      n= atoi( sparam);
      gPretty.tab = n;
      return kOptOkay;
      }

    if (checkopt( false, sopt, "-gapcount", 4)) {
      gPretty.baseonlynum = false;
      /* if (*sparam >= ' ') gPretty.gapchar= *sparam; */
      return kOptOkay;
      }
    if (checkopt( false, sopt, "-nointerleave", 8)) {
      gPretty.noleaves = true;
      return kOptOkay;
      }

    if (checkopt( false, sopt, "-nameleft", 7)) {
      if (*sparam==0) { for (sparam= sopt+2; !isdigit(*sparam) && *sparam!=0; sparam++) ; }
      n= atoi( sparam);
      if (n>0 && n<50) gPretty.namewidth =  n;
      gPretty.nameleft= true;
      return kOptOkay;
      }
    if (checkopt( false, sopt, "-nameright", 7)) {
      if (*sparam==0) { for (sparam= sopt+2; !isdigit(*sparam) && *sparam!=0; sparam++) ; }
      n= atoi( sparam);
      if (n>0 && n<50) gPretty.namewidth =  n;
      gPretty.nameright= true;
      return kOptOkay;
      }
    if (checkopt( false, sopt, "-nametop", 6)) {
      gPretty.nametop= true;
      return kOptOkay;
      }

    if (checkopt( false, sopt, "-numleft", 6)) {
      if (*sparam==0) { for (sparam= sopt+2; !isdigit(*sparam) && *sparam!=0; sparam++) ; }
      n= atoi( sparam);
      if (n>0 && n<50) gPretty.numwidth =  n;
      gPretty.numleft= true;
      return kOptOkay;
      }
    if (checkopt( false, sopt, "-numright", 6)) {
      if (*sparam==0) { for (sparam= sopt+2; !isdigit(*sparam) && *sparam!=0; sparam++) ; }
      n= atoi( sparam);
      if (n>0 && n<50) gPretty.numwidth =  n;
      gPretty.numright= true;
      return kOptOkay;
      }

    if (checkopt( false, sopt, "-numtop", 6)) {
      gPretty.numtop= true;
      return kOptOkay;
      }
    if (checkopt( false, sopt, "-numbottom", 6)) {
      gPretty.numbot= true;
      return kOptOkay;
      }

    else {
      usage();
      return eOptionBad;
      }
    }

  else {
    strcpy( inputfile, sopt);
    gotinputfile = (*inputfile != 0);
    nfile++;
    return kOptOkay;
    }

 /* return kOptNone; -- never here */
}




/* this program suffers some as it tries to be a quiet translator pipe
   _and_ a noisy user interactor
*/

/* return is best for SIOW, okay for others */
#ifdef SIOW
#define Exit(a)   return(a)
siow_main( int argc, char *argv[])

#else
#define Exit(a)   exit(a)

main( int argc, char *argv[])
#endif
{
boolean   closein = false;
short     ifile, nseq, atseq, format, err = 0, seqtype = kDNA,
          nlines, seqout = 0, phylvers = 2;
long      i, skiplines, seqlen, seqlen0;
unsigned long  checksum= 0, checkall= 0;
char      *seq, *cp, *firstseq = NULL, *seqlist, *progname, tempname[256];
char      seqid[256], *seqidptr = seqid;
char      stempstore[256], *stemp = stempstore;
FILE      *ftmp, *fin, *fout;
long      outindexmax= 0, noutindex= 0, *outindex = NULL;

#define exit_main(err) {        \
  if (closeout) fclose(fout);   \
  if (closein) fclose(fin);   \
  if (*tempname!=0) remove(tempname);\
  Exit(err); }

#define indexout()  if (interleaved) {\
  if (noutindex>=outindexmax) {\
    outindexmax= noutindex + 20;\
    outindex= (long*) realloc(outindex, sizeof(long)*outindexmax);\
    if (outindex==NULL) { err= eMemFull; erralert(err); exit_main(err); }\
    }\
  outindex[noutindex++]= ftell(fout);\
  }


  resetGlobals();
  foo = stdout;
  progname = argv[0];
  *oname = 0;
  *tempname = 0;
  /* initialize gPretty ?? -- done in header */

  for (i=1; i < argc; i++) {
    err= readopt( argv[i]);
    if (err <= 0) exit_main(err);
    }

                            /* pipe input from stdin !? */
  if (dopipe && !gotinputfile) {
    int c;
    tmpnam(tempname);
    inputfile = tempname;
    ftmp = fopen( inputfile, "w");
    if (!ftmp) { erralert(eFileCreate); exit_main(eFileCreate); }
    while ((c = getc(stdin)) != EOF) fputc(c, ftmp);
    fclose(ftmp);
    gotinputfile= true;
    }

  quietly = (dopipe || (gotinputfile && (listonly || whichSeq != 0)));

  if (verbose || (!quietly && !gotinputfile)) fprintf( stderr, title);
  ifile = 1;

                            /* UI: Choose output */
  if (askout && !closeout && !quietly) {
    askout = false;
    fprintf(stderr,"\nName of output file (?=help, defaults to display): \n");
    gets(oname= onamestore);
    skipwhitespace(oname);
    if (*oname == '?') { usage(); exit_main(0); }
    else if (*oname != 0) {
      closeout = true;
      foo = fopen( oname, "w");
      if (!foo) { erralert(eFileCreate); exit_main(eFileCreate); }
      }
    }

  fout = foo;
  if (outform == kNoformat) outform = chooseFormat(quietly);

                          /* set up formats ... */
  switch (outform) {
    case kPhylip2:
      interleaved= false;
      phylvers = 2;
      outform = kPhylip;
      break;

    case kPhylip4:
      interleaved= true;
      phylvers = 4;
      outform = kPhylip;
      break;

    case kMSF:
    case kPAUP:
      interleaved= true;
      break;

    case kPretty:
      gPretty.isactive= true;
      interleaved= true;
      break;

    }

  if (gPretty.isactive && gPretty.noleaves) interleaved= false;
  if (interleaved) {
    fout = ftmp = tmpfile();
    outindexmax= 30; noutindex= 0;
    outindex = (long*) malloc(outindexmax*sizeof(long));
    if (outindex==NULL) { err= eMemFull; erralert(err); exit_main(err); }
    }

                        /* big loop over all input files */
  do {
                        /* select next input file */
    gotinputfile = (*tempname != 0);
    while ((ifile < argc) && (!gotinputfile)) {
      if (*argv[ifile] != '-') {
        strcpy( inputfile, argv[ifile]);
        gotinputfile = (*inputfile != 0);
        --nfile;
        }
      ifile++;
      }

    while (!gotinputfile) {
      fprintf(stderr,"\nName an input sequence or -option: \n");
      inputfile= inputfilestore;

      gets(stemp= stempstore);
      if (*stemp==0) goto fini;  /* !! need this to finish work during interactive use */
      stemp= strtok(stempstore, " \n\r\t");
      while (stemp) {
        err= readopt( stemp); /* will read inputfile if it exists */
        if (err<0) exit_main(err);
        stemp= strtok( NULL, " \n\r\t");
        }
      }
              /* thanks to AJB@UK.AC.DARESBURY.DLVH for this PHYLIP3 fix: */
              /* head for end (interleave if needed) */
    if (*inputfile == 0) break;

    format = seqFileFormat( inputfile, &skiplines, &err);

    if (err == 0)  {
#ifdef NCBI
      if (format == kASNseqentry || format == kASNseqset)
        seqlist = listASNSeqs( inputfile, skiplines, format, &nseq, &err);
      else
#endif
        seqlist = listSeqs( inputfile, skiplines, format, &nseq, &err);
      }

    if (err != 0)
      erralert(err);

    else if (listonly) {
      dumpSeqList(seqlist,format);
      free( seqlist);
      }

    else {
                                /* choose whichSeq if needed */
      if (nseq == 1 || chooseall || (quietly && whichSeq == 0)) {
        chooseall= true;
        whichSeq = 1;
        quietly = true; /* no loop */
        }
      else if (whichSeq > nseq && quietly) {
        erralert(eItemNotFound);
        err= eItemNotFound;
        }
      else if (whichSeq > nseq || !quietly) {
        dumpSeqList(seqlist, format);
        fprintf(stderr,"\nChoose a sequence (# or All): \n");
        gets(stemp= stempstore);
        skipwhitespace(stemp);
        if (to_lower(*stemp) == 'a') {
          chooseall= true;
          whichSeq = 1;
          quietly = true; /* !? this means we don't ask for another file 
                            as well as no more whichSeqs... */
          }
        else if (isdigit(*stemp)) whichSeq= atol(stemp);
        else whichSeq= 1; /* default */
        }
      free( seqlist);

      if (false /*chooseall*/) {  /* this isn't debugged yet...*/
        fin = fopen(inputfile, "r");
        closein= true;
        }

      while (whichSeq > 0 && whichSeq <= nseq) {
                                /* need to open multiple output files ? */
        manyout = ((chooseall || nwhichlist>1) && nseq > 1
                  && (outform == kPlain || outform == kGCG));
        if (manyout) {
          if ( whichSeq == 1 ) erralert(eOneFormat);
          else if (closeout) {
            sprintf( stemp,"%s_%d", oname, whichSeq);
            freopen( stemp, "w", fout);
            fprintf( stderr,"Writing sequence %d to file %s\n", whichSeq, stemp);
            }
          }

        if (closein) {
          /* !! this fails... skips most seqs... */
          /* !! in sequential read, must count seqs already read from whichSeq ... */
          /* need major revision of ureadseq before we can do this */
          atseq= whichSeq-1;
          seqidptr= seqid;
          seq = readSeqFp( whichSeq, fin, skiplines, format,
                          &seqlen, &atseq, &err, seqidptr);
          skiplines= 0;
          }
        else {
          atseq= 0;
          seqidptr= seqid;
#ifdef NCBI
          if (format == kASNseqentry || format == kASNseqset) {
            seqidptr= NULL;
            seq = readASNSeq( whichSeq, inputfile, skiplines, format,
                     &seqlen, &atseq, &err, &seqidptr);
            }
          else
#endif
          seq = readSeq( whichSeq, inputfile, skiplines, format,
                          &seqlen, &atseq, &err, seqidptr);
          }


        if (gPretty.degap) {
          char *newseq;
          long newlen;
          newseq= compressSeq( gPretty.gapchar, seq, seqlen, &newlen);
          if (newseq) {
            free(seq); seq= newseq; seqlen= newlen;
            }
          }

        if (outform == kMSF) checksum= GCGchecksum(seq, seqlen, &checkall);
        else if (verbose) checksum= seqchecksum(seq, seqlen, &checkall);
        if (verbose)
          fprintf( stderr, "Sequence %d, length= %d, checksum= %X, format= %s, id= %s\n",
                whichSeq, seqlen, checksum, formatstr(format), seqidptr);

        if (err != 0) erralert(err);
        else {
                                  /* format fixes that writeseq doesn't do */
          switch (outform) {
            case kPIR:
              if (seqout == 0) fprintf( foo,"\\\\\\\n");
              break;
            case kASN1:
              if (seqout == 0) fprintf( foo, kASN1headline);
              break;

            case kPhylip:
              if (seqout == 0) {
                if (!interleaved) {  /*  bug, nseq is for 1st infile only */
                  if (chooseall) i= nseq; else i=1;
                  if (phylvers >= 4) fprintf(foo," %d %d\n", i, seqlen);
                  else fprintf(foo," %d %d YF\n", i, seqlen);
                  }
                seqlen0 = seqlen;
                }
              else if (seqlen != seqlen0) {
                erralert(eUnequalSize);
                if (seqlen < seqlen0) seq = (char *)realloc(seq, seqlen0);
                for (i=seqlen; i<seqlen0; i++) seq[i]= gPretty.gapchar;
                seqlen = seqlen0;
                seq[seqlen] = 0; 
                }
              break;

            case kPAUP:
              if (seqout == 0) {
                seqtype= getseqtype(seq, seqlen);
                seqlen0 = seqlen;
                }
              else if (seqlen != seqlen0) {
                erralert(eUnequalSize);
                if (seqlen < seqlen0) seq = (char *)realloc(seq, seqlen0); 
                for (i=seqlen; i<seqlen0; i++) seq[i]= gPretty.gapchar;
                seqlen = seqlen0;
                seq[seqlen] = 0; 
                }
              break;

            }

          if (doupper)
            for (i = 0; i<seqlen; i++) seq[i] = to_upper(seq[i]);
          else if (dolower)
            for (i = 0; i<seqlen; i++) seq[i] = to_lower(seq[i]);

          if (doreverse) {
            long  j, k;
            char  ctemp;
            for (j=0, k=seqlen-1; j <= k; j++, k--) {
              ctemp = compl[seq[j] - ' '];
              seq[j] = compl[seq[k] - ' '];
              seq[k] = ctemp;
              }
            }

          if ((gPretty.isactive || outform==kPAUP) && gPretty.domatch && firstseq != NULL) {
            for (i=0; i<seqlen; i++)
              if (seq[i]==firstseq[i]) seq[i]= gPretty.matchchar;
            }


          if (gPretty.isactive && gPretty.numtop && seqout == 0) {
            gPretty.numline = 1;
            indexout();
            (void) writeSeq( fout, seq, seqlen, outform, seqidptr);
            gPretty.numline = 2;
            indexout();
            (void) writeSeq( fout, seq, seqlen, outform, seqidptr);
            gPretty.numline = 0;
            }

          indexout();
          nlines = writeSeq( fout, seq, seqlen, outform, seqidptr);
          seqout++;
          }

        if ((gPretty.isactive || outform==kPAUP) && gPretty.domatch && firstseq == NULL) {
          firstseq= seq;
          seq = NULL;
          }
        else if (seq!=NULL) { free(seq); seq = NULL; }

#ifdef NCBI
       if ( (format == kASNseqentry || format == kASNseqset)
          && seqidptr && seqidptr!= seqid)
            free(seqidptr);
#endif
        if (chooseall) whichSeq++;
        else if (iwhichlist<nwhichlist) whichSeq= whichlist[iwhichlist++];
        else whichSeq= 0;
        }
      if (closein) { fclose(fin); closein= false; }
      }
    whichSeq  = 0;
  } while (nfile > 0 || !quietly);


fini:
  if (firstseq) { free(firstseq); firstseq= NULL; }
  if (err || listonly) exit_main(err);

  if (gPretty.isactive && gPretty.numbot) {
    gPretty.numline = 2;
    indexout();
    (void) writeSeq( fout, seq, seqlen, outform, seqidptr);
    gPretty.numline = 1;
    indexout();
    (void) writeSeq( fout, seq, seqlen, outform, seqidptr);
    gPretty.numline = 0;
    }

  if (outform == kMSF) {
    if (*oname) cp= oname; else cp= inputfile;
    fprintf(foo,"\n %s  MSF: %d  Type: N  January 01, 1776  12:00  Check: %d ..\n\n",
                  cp, seqlen, checkall);
    }

  if (outform == kPAUP) {
    fprintf(foo,"#NEXUS\n");
    if (*oname) cp= oname; else cp= inputfile;
    fprintf(foo,"[%s -- data title]\n\n", cp);
    /* ! now have header lines for each sequence... put them before "begin data;... */
    }

  if (outform==kPhylip && interleaved) {
    if (phylvers >= 4) fprintf(foo," %d %d\n", seqout, seqlen);
    else fprintf(foo," %d %d YF\n", seqout, seqlen);
    }

  if (interleaved) {
    /* interleave species lines in true output */
    /* nlines is # lines / sequence */
    short iline, j, leaf, iseq;
    char  *s = stempstore;

    indexout();  noutindex--; /* mark eof */

    for (leaf=0; leaf<nlines; leaf++) {
      if (outform == kMSF && leaf == 1) {
        fputs("//\n\n", foo);
        }
      if (outform == kPAUP && leaf==1) {
        switch (seqtype) {
          case kDNA     : cp= "dna"; break;
          case kRNA     : cp= "rna"; break;
          case kNucleic : cp= "dna"; break;
          case kAmino   : cp= "protein"; break;
          case kOtherSeq: cp= "dna"; break;
          }
        fprintf(foo,"\nbegin data;\n");
        fprintf(foo," dimensions ntax=%d nchar=%d;\n", seqout, seqlen);
        fprintf(foo," format datatype=%s interleave missing=%c", cp, gPretty.gapchar);
        if (gPretty.domatch) fprintf(foo," matchchar=%c", gPretty.matchchar);
        fprintf(foo,";\n  matrix\n");
        }

      for (iseq=0; iseq<noutindex; iseq++) {
        fseek(ftmp, outindex[iseq], 0);
        for (iline=0; iline<=leaf; iline++)
          if (!fgets(s, 256, ftmp)) *s= 0;
        if (ftell(ftmp) <= outindex[iseq+1])
          fputs( s, foo);
        }

      for (j=0; j<gPretty.interline; j++)
        fputs( "\n", foo);  /* some want spacer line */
      }
    fclose(ftmp); /* tmp disappears */
    fout= foo;
    }

  if (outform == kASN1)  fprintf( foo, "} }\n");
  if (outform == kPAUP)  fprintf( foo,";\n  end;\n");

  if (outindex != NULL) free(outindex);
  exit_main(0);
}


SHAR_EOF
fi # end of overwriting check
echo shar: extracting "'ureadseq.c'" '(52582 characters)'
if test -f 'ureadseq.c'
then
	echo shar: will not over-write existing file "'ureadseq.c'"
else
cat << \SHAR_EOF > 'ureadseq.c'
/* File: ureadseq.c
 *
 * Reads and writes nucleic/protein sequence in various
 * formats. Data files may have multiple sequences.
 *
 * Copyright 1990 by d.g.gilbert
 * biology dept., indiana university, bloomington, in 47405
 * e-mail: gilbertd@bio.indiana.edu
 *
 * This program may be freely copied and used by anyone.
 * Developers are encouraged to incorporate parts in their
 * programs, rather than devise their own private sequence
 * format.
 *
 * This should compile and run with any ANSI C compiler.
 *
 */


#include <stdio.h>
#include <ctype.h>
#include <string.h>

#define UREADSEQ_G
#include "ureadseq.h"

#pragma segment ureadseq


int Strcasecmp(const char *a, const char *b)  /* from Nlm_StrICmp */
{
  /* Compare two NUL-terminated strings after folding each character with
     to_upper().  Returns 0 when they match, otherwise the difference of the
     first pair of folded characters that differ. */
  int delta;

  if (a == b) return 0;           /* same pointer: trivially equal */
  for ( ; ; a++, b++) {
    delta = to_upper(*a) - to_upper(*b);
    if (delta != 0) return delta;
    if (*a == '\0') break;        /* both ended together */
    }
  return 0;
}

int Strncasecmp(const char *a, const char *b, long maxn) /* from Nlm_StrNICmp */
{
  /* Compare at most maxn characters of two NUL-terminated strings after
     folding each character with to_upper().
     Returns 0 when the compared prefixes match, otherwise the difference of
     the first pair of folded characters that differ.
     A count of zero (or less) compares nothing and returns 0; previously the
     counter was decremented past zero and the whole strings were compared. */
  int diff;

  if (a == b || maxn <= 0) return 0;
  while (maxn-- > 0) {
    diff = to_upper(*a) - to_upper(*b);
    if (diff) return diff;
    if (*a == '\0') break;        /* both strings ended inside the window */
    a++; b++;
    }
  return 0;
}





#ifndef Local
# define Local      static    /* local functions */
#endif

/* Initial capacity of a sequence buffer (see readSeq/readSeqFp) and the
   amount addseq() grows it by each time it runs out of room. */
#define kStartLength  500

/* Residue alphabets.  Not referenced in this part of the file --
   NOTE(review): presumably used by sequence-type detection elsewhere; confirm. */
const char *aminos      = "ABCDEFGHIKLMNPQRSTVWXYZ*";
const char *primenuc    = "ACGTU";
const char *protonly    = "EFIPQZ";

/* Punctuation sets.  seqsymbols points at the set that isSeqChar() and
   isSeqNumChar() accept in addition to letters (and digits). */
const char kNocountsymbols[5]  = "_.-?";
const char stdsymbols[6]  = "_.-*?";
const char allsymbols[32] = "_.-*?<>{}[]()!@#$%^&=+;:'/|`~\"\\";
static const char *seqsymbols   = allsymbols;

/* nonummask[d] is the symbol readOlsen() substitutes for digit d, because
   digits would otherwise be dropped by addseq(); nummask is the digit set. */
const char nummask[11]   = "0123456789";
const char nonummask[11] = "~!@#$%^&*(";

/*
    use general form of isseqchar -- all chars + symbols.
    no formats except nbrf (?) use symbols in data area as
    anything other than sequence chars.
*/



                          /* Local variables for readSeq: */
/* Working state shared by readSeqMain() and all the per-format readers. */
struct ReadSeqVars {
  /* choice: 1-based index of the sequence wanted, or kListSequences to list ids;
     err: error code handed back to the caller (0 = none);
     nseq: number of sequences encountered so far */
  short choice, err, nseq;
  /* seqlen: characters stored in seq; maxseq: capacity of seq, not counting
     the terminator; seqlencount: tally of sequence characters kept by countseq() */
  long  seqlen, maxseq, seqlencount;
  /* sequence count taken from a file header (PAUP "ntax=", Phylip first line) */
  short topnseq;
  /* sequence length taken from a file header (PAUP "nchar=", Phylip first line) */
  long  topseqlen;
  /* file name, or NULL when reading from an already open stream */
  const char *fname;
  /* seq: growable output buffer; seqid: caller-supplied buffer for the name;
     matchchar: PAUP "matchchar=" symbol, 0 when none was declared */
  char *seq, *seqid, matchchar;
  /* allDone: reader has finished with the file; done: current readLoop() pass
     has finished; filestart: stream was at offset 0 on entry (cleared by
     readSeqMain); addit: addseq() stores characters only while this is set */
  boolean allDone, done, filestart, addit;
  /* input stream */
  FILE  *f;
  /* ftell() offset of the line most recently read, used by ungetline() */
  long  linestart;
  /* s: current input line; sp: not referenced in the visible code */
  char  s[256], *sp;

  /* predicate deciding which characters addseq()/countseq() accept */
  int (*isseqchar)();
  /* int  (*isseqchar)(int c);  << sgi cc hates (int c) */
};



/* Nonzero when c is a letter or one of the symbols in seqsymbols. */
int isSeqChar(int c)
{
  if (isalpha(c)) return 1;
  return (strchr(seqsymbols, c) != NULL);
}

/* Nonzero when c is a letter, a digit, or one of the symbols in seqsymbols. */
int isSeqNumChar(int c)
{
  if (isalnum(c)) return 1;
  return (strchr(seqsymbols, c) != NULL);
}


/* Function form of isascii(), so it can be stored in a function pointer
   even where isascii is only available as a macro. */
int isAnyChar(int c)
{
  int ascii;

  ascii = isascii(c);
  return ascii;
}

/* Read one line (at most 255 characters) from f into s.
 *   *linestart receives the stream offset at which the line began.
 *   The trailing newline, if present, is removed.
 *   When nothing can be read, s becomes the empty string.
 */
Local void readline(FILE *f, char *s, long *linestart)
{
  char  *eol;

  *linestart= ftell(f);
  if (fgets(s, 256, f) != NULL) {
    eol = strchr(s, '\n');
    if (eol) *eol = 0;
    }
  else
    *s = 0;
}

/* Fetch the next input line into V->s and record its start offset in
   V->linestart. */
Local void getline(struct ReadSeqVars *V)
{
  FILE  *in  = V->f;
  char  *buf = V->s;

  readline(in, buf, &V->linestart);
}

/* Reposition the stream at the start of the line most recently read, so
   the next getline() returns it again. */
Local void ungetline(struct ReadSeqVars *V)
{
  long  where = V->linestart;

  fseek(V->f, where, 0);
}


/* Append to V->seq every character of s that V->isseqchar accepts.
 * Does nothing unless V->addit is set.
 * The buffer grows in kStartLength steps.  If growing fails, V->err is set to
 * eMemFull and the function returns; V->seq and V->maxseq are left describing
 * the old, still valid buffer.  (Previously maxseq was increased before the
 * realloc was known to have succeeded, so later calls could write past the
 * end of the allocation.)
 */
Local void addseq(char *s, struct ReadSeqVars *V)
{
  char  *ptr;
  long  newmax;

  if (!V->addit) return;

  for ( ; *s != 0; s++) {
    if (!(V->isseqchar)(*s)) continue;
    if (V->seqlen >= V->maxseq) {
      newmax = V->maxseq + kStartLength;
      ptr = (char*) realloc(V->seq, newmax+1);   /* +1 for the terminator */
      if (ptr==NULL) {
        V->err = eMemFull;
        return;
        }
      V->seq = ptr;
      V->maxseq = newmax;       /* commit the new capacity only on success */
      }
    V->seq[(V->seqlen)++] = *s;
    }
}

/* Add to V->seqlencount the number of characters in s that V->isseqchar
   accepts.  Unlike addseq() this ignores V->addit: some formats
   (paup-sequential) must count residues even for sequences being skipped. */
Local void countseq(char *s, struct ReadSeqVars *V)
{
  char  *cp;

  for (cp = s; *cp != 0; cp++)
    if ((V->isseqchar)(*cp)) V->seqlencount++;
}


/* Append one listing line, " <nseq>)  <text>\n", to V->seq.
 * Leading blanks of s are skipped.  For the duration of the call V->addit is
 * forced on and every ASCII character is accepted; afterwards addit is
 * restored and the character test is set back to isSeqChar.
 *
 * The line is formatted into a local buffer.  The text is limited to 255
 * characters (the most an input line can hold) and the buffer is sized for
 * that plus the prefix, so an id close to line length can no longer overrun
 * it as it could with the former 256-byte buffer.
 */
Local void addinfo(char *s, struct ReadSeqVars *V)
{
  char s2[272], *si;      /* 1 + 6 digits + 3 + 255 + newline + NUL fits */
  boolean saveadd;

  si = s2;
  while (*s == ' ') s++;
  sprintf(si, " %d)  %.255s\n", V->nseq, s);

  saveadd = V->addit;
  V->addit = true;
  V->isseqchar = isAnyChar;
  addseq( si, V);
  V->addit = saveadd;
  /* NOTE(review): resets to isSeqChar rather than restoring the previous
     predicate; kept as is because callers may rely on it. */
  V->isseqchar = isSeqChar;
}




/* Generic "read lines until the format says stop" driver.
 *   margin   - number of leading columns to drop from each data line
 *   addfirst - when set, the line already in V->s is treated as data
 *   endTest  - format-specific test for the end of the current sequence;
 *              it reports through addend whether the terminating line still
 *              holds data, and through ungetend whether that line must be
 *              pushed back for the next sequence
 * Counts the sequence in V->nseq and stores its characters only when it is
 * the one selected by V->choice.  In list mode a listing line is added
 * instead.
 */
Local void readLoop(short margin, boolean addfirst,
            boolean (*endTest)(boolean *addend, boolean *ungetend, struct ReadSeqVars *V),
            struct ReadSeqVars *V)
{
  boolean keepEnd  = false;
  boolean pushBack = false;

  V->nseq++;
  V->addit = (V->choice == kListSequences) ? false : (V->nseq == V->choice);
  if (V->addit) V->seqlen = 0;

  if (addfirst) addseq(V->s, V);

  for (;;) {
    getline(V);
    V->done = feof(V->f);
    V->done |= (*endTest)( &keepEnd, &pushBack, V);
    if (V->addit && (keepEnd || !V->done) && (strlen(V->s) > margin))
      addseq( (V->s)+margin, V);
    if (V->done) break;
    }

  if (V->choice == kListSequences) {
    addinfo(V->seqid, V);
    return;
    }
  V->allDone = (V->nseq >= V->choice);
  if (V->allDone && pushBack) ungetline(V);
}



/* IG end test: a line containing '1' or '2' ends the sequence.  That line
   also carries bases, so it is still added; nothing is pushed back. */
Local boolean endIG( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
{
  *ungetend= false;
  *addend = true;
  if (strchr(V->s,'1') != NULL) return true;
  return (strchr(V->s,'2') != NULL);
}

/* Reader for IG format.
 * 18Aug92: new IG format -- ^L between sequences in place of ";"
 * For each sequence: skip empty lines and ';' comment lines (leading control
 * characters are blanked first, and a line consisting only of them counts as
 * empty), take the next line as the id, then let readLoop() collect the data.
 */
Local void readIG(struct ReadSeqVars *V)
{
  char  *cp;

  while (!V->allDone) {
    for (;;) {
      getline(V);
      cp = V->s;
      while (*cp != 0 && *cp < ' ') *cp++ = ' ';   /* drop controls */
      if (*cp == 0) *V->s= 0;                       /* chop line to empty */
      if (feof(V->f)) break;
      if ((*V->s != 0) && (*V->s != ';')) break;
      }
    if (feof(V->f)) {
      V->allDone = true;
      continue;
      }
    strcpy(V->seqid, V->s);
    readLoop(0, false, endIG, V);
    }
}



/* Strider end test: a line containing "//" ends the sequence; that line is
   neither added nor pushed back. */
Local boolean endStrider( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
{
  char  *slashes = strstr( V->s, "//");

  *ungetend= false;
  *addend = false;
  return (slashes != NULL);
}

/* Reader for DNA Strider files (? only 1 seq/file ?).
 * The id is taken from the line following the current one: the text after
 * "; DNA sequence  " when that prefix is present, otherwise the line minus
 * its first character.  Remaining ';' comment lines are skipped and the data
 * is collected by readLoop() up to the "//" line.
 *
 * An empty line (for example at end of file) now yields an empty id instead
 * of copying whatever stale bytes followed the terminator in the line buffer.
 */
Local void readStrider(struct ReadSeqVars *V)
{
  while (!V->allDone) {
    getline(V);
    if (strstr(V->s,"; DNA sequence  ") == V->s)
      strcpy(V->seqid, (V->s)+16);      /* prefix is exactly 16 characters */
    else if (*V->s != 0)
      strcpy(V->seqid, (V->s)+1);
    else
      *V->seqid = 0;
    while ((!feof(V->f)) && (*V->s == ';')) {
      getline(V);
      }
    if (feof(V->f)) V->allDone = true;
    else readLoop(0, true, endStrider, V);
  }
}


/* PIR end test: the sequence ends at a line containing "///" or at a line
   starting with "ENTRY"; the latter belongs to the next record and is pushed
   back.  The terminating line is never added. */
Local boolean endPIR( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
{
  boolean nextEntry = (strstr(V->s,"ENTRY") == V->s);

  *addend = false;
  *ungetend= nextEntry;
  if (strstr(V->s,"///") != NULL) return true;
  return nextEntry;
}

/* Reader for PIR (CODATA) files -- many seqs/file.
 * For each record: find the ENTRY (or SEQUENCE) line, take the id from
 * column 16 on, advance to the line starting with "SEQUENCE", collect the
 * data with readLoop(), then skip ahead to the next ENTRY line.
 *
 * When the line holding the id is shorter than 16 characters the id is now
 * left empty; previously the copy started beyond the string terminator and
 * picked up stale or uninitialised bytes.
 */
Local void readPIR(struct ReadSeqVars *V)
{
  while (!V->allDone) {
    while (! (feof(V->f) || strstr(V->s,"ENTRY")  || strstr(V->s,"SEQUENCE")) )
      getline(V);
    if (strlen(V->s) > 16) strcpy(V->seqid, (V->s)+16);
    else *V->seqid = 0;
    while (! (feof(V->f) || strstr(V->s,"SEQUENCE") == V->s))
      getline(V);
    readLoop(0, false, endPIR, V);

    if (!V->allDone) {
     while (! (feof(V->f) || ((*V->s != 0)
       && (strstr( V->s,"ENTRY") == V->s))))
        getline(V);
      }
    if (feof(V->f)) V->allDone = true;
  }
}


/* GenBank end test: the sequence ends at a line containing "//" or at a line
   starting with "LOCUS"; the latter opens the next record and is pushed back.
   The terminating line is never added. */
Local boolean endGB( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
{
  boolean nextLocus = (strstr(V->s,"LOCUS") == V->s);

  *addend = false;
  *ungetend= nextLocus;
  if (strstr(V->s,"//") != NULL) return true;
  return nextLocus;
}

/* Reader for GenBank flat files -- many seqs/file.
 * The current line is taken as the LOCUS line and the id is its text from
 * column 12 on.  The reader then advances to the line starting with "ORIGIN",
 * collects the data with readLoop(), and skips to the next LOCUS line.
 *
 * When the current line is shorter than 12 characters the id is now left
 * empty; previously the copy started beyond the string terminator and picked
 * up stale or uninitialised bytes.
 */
Local void readGenBank(struct ReadSeqVars *V)
{
  while (!V->allDone) {
    if (strlen(V->s) > 12) strcpy(V->seqid, (V->s)+12);
    else *V->seqid = 0;
    while (! (feof(V->f) || strstr(V->s,"ORIGIN") == V->s))
      getline(V);
    readLoop(0, false, endGB, V);

    if (!V->allDone) {
     while (! (feof(V->f) || ((*V->s != 0)
       && (strstr( V->s,"LOCUS") == V->s))))
        getline(V);
      }
    if (feof(V->f)) V->allDone = true;
  }
}


/* NBRF end test.
 *   '*' anywhere in the line: end of this sequence.  The line is cut at the
 *       '*' ("*" can be a valid base symbol, so it is dropped here) and what
 *       precedes it is still added.
 *   line starting with '>': start of the next sequence; nothing is added and
 *       the line is pushed back.
 *   otherwise: not at the end; addend and ungetend are left untouched.
 */
Local boolean endNBRF( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
{
  char  *star = strchr(V->s, '*');

  if (star != NULL) {
    *star = 0;
    *addend = true;
    *ungetend= false;
    return(true);
    }
  if (*V->s != '>')
    return(false);

  *addend = false;
  *ungetend= true;
  return(true);
}

/* Reader for NBRF files.
 * The current line is taken as the header line and the id is its text from
 * column 4 on.  The following title line is skipped, the data is collected
 * by readLoop(), and the reader then advances to the next line starting
 * with '>'.
 *
 * A header line shorter than 4 characters now gives an empty id; previously
 * the copy started beyond the string terminator and picked up stale or
 * uninitialised bytes.
 */
Local void readNBRF(struct ReadSeqVars *V)
{
  while (!V->allDone) {
    if (strlen(V->s) > 4) strcpy(V->seqid, (V->s)+4);
    else *V->seqid = 0;
    getline(V);   /*skip title-junk line*/
    readLoop(0, false, endNBRF, V);
    if (!V->allDone) {
     while (!(feof(V->f) || (*V->s != 0 && *V->s == '>')))
        getline(V);
      }
    if (feof(V->f)) V->allDone = true;
  }
}



/* Pearson/FASTA end test: a line starting with '>' opens the next sequence.
   It is never added and is always flagged for push-back. */
Local boolean endPearson( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
{
  char  lead = *V->s;

  *ungetend= true;
  *addend = false;
  return(lead == '>');
}

/* Reader for Pearson/FASTA files.  The current line is the '>' header; the
   id is everything after its first character.  Data lines are collected by
   readLoop(), after which the reader moves on to the next '>' line. */
Local void readPearson(struct ReadSeqVars *V)
{
  for ( ; !V->allDone; ) {
    strcpy(V->seqid, (V->s)+1);
    readLoop(0, false, endPearson, V);
    if (!V->allDone) {
      for (;;) {
        if (feof(V->f)) break;
        if ((*V->s != 0) && (*V->s == '>')) break;
        getline(V);
        }
      }
    if (feof(V->f)) V->allDone = true;
    }
}



/* EMBL end test: the sequence ends at a line containing "//" or at a line
   starting with "ID   "; the latter opens the next record and is pushed
   back.  The terminating line is never added. */
Local boolean endEMBL( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
{
  boolean nextId = (strstr(V->s,"ID   ") == V->s);

  *addend = false;
  *ungetend= nextId;
  if (strstr(V->s,"//") != NULL) return true;
  return nextId;
}

/* Reader for EMBL flat files.
 * The current line is taken as the "ID   " line and the id is its text from
 * column 5 on.  The reader advances to the line starting with "SQ   ",
 * collects the data with readLoop(), then skips to the next "ID   " line.
 *
 * Changes from the previous version:
 *  - an ID line shorter than 5 characters gives an empty id instead of a
 *    copy that starts beyond the string terminator;
 *  - the loop conditions use logical || and && in place of bitwise | and &
 *    (same truth value, conventional short-circuit form).
 */
Local void readEMBL(struct ReadSeqVars *V)
{
  while (!V->allDone) {
    if (strlen(V->s) > 5) strcpy(V->seqid, (V->s)+5);
    else *V->seqid = 0;
    do {
      getline(V);
    } while (!(feof(V->f) || (strstr(V->s,"SQ   ") == V->s)));

    readLoop(0, false, endEMBL, V);
    if (!V->allDone) {
      while (!(feof(V->f) ||
         ((*V->s != '\0') && (strstr(V->s,"ID   ") == V->s))))
        getline(V);
    }
    if (feof(V->f)) V->allDone = true;
  }
}



/* Zuker end test: a line starting with '(' opens the next sequence.  It is
   never added and is always flagged for push-back. */
Local boolean endZuker( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
{
  char  lead = *V->s;

  *ungetend= true;
  *addend = false;
  return( lead == '(' );
}

/* Reader for Zuker files.
 * ! 1st string is Zuker's Fortran format.
 * The line after the format line has the form "seqLen seqid string..."; the
 * id is its text from column 6 on.  Data is collected by readLoop() and the
 * reader then advances to the next line starting with '('.
 *
 * Changes from the previous version:
 *  - a line shorter than 6 characters (for example at end of file) gives an
 *    empty id instead of a copy that starts beyond the string terminator;
 *  - the loop condition uses logical || and && in place of bitwise | and &.
 */
Local void readZuker(struct ReadSeqVars *V)
{
  while (!V->allDone) {
    getline(V);  /*s == "seqLen seqid string..."*/
    if (strlen(V->s) > 6) strcpy(V->seqid, (V->s)+6);
    else *V->seqid = 0;
    readLoop(0, false, endZuker, V);
    if (!V->allDone) {
      while (!(feof(V->f) ||
        ((*V->s != '\0') && (*V->s == '('))))
          getline(V);
      }
    if (feof(V->f)) V->allDone = true;
  }
}



/* Fitch end test.  This is a somewhat shaky end: a line whose first
   character is not a blank is taken to be the title of the next sequence.
   The line is never added and is always flagged for push-back. */
Local boolean endFitch( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
{
  char  lead = *V->s;

  *ungetend= true;
  *addend = false;
  return( lead != ' ' );
}

/* Reader for Fitch files.  For the first sequence the caller has already
   stored the id and the current line is data (so it is added); for every
   later sequence the current line is the title and becomes the id. */
Local void readFitch(struct ReadSeqVars *V)
{
  boolean atStart;

  for (atStart = true; !V->allDone; atStart = false) {
    if (!atStart) strcpy(V->seqid, V->s);
    readLoop(0, atStart, endFitch, V);
    if (feof(V->f)) V->allDone = true;
    }
}


/* Reader for plain/unknown files: the whole file is one sequence.
   Whatever is already in V->seqid is added as data first, then the id is
   replaced by "<file name>  [Unknown form]" and every line through end of
   file is added. */
Local void readPlain(struct ReadSeqVars *V)
{
  V->nseq++;
  V->addit = (V->choice > 0);
  if (V->addit) V->seqlen = 0;

  addseq(V->seqid, V);   /*from above..*/
  if (V->fname == NULL) sprintf(V->seqid, "  [Unknown form]");
  else sprintf(V->seqid, "%s  [Unknown form]", V->fname);

  for (;;) {
    addseq(V->s, V);
    V->done = feof(V->f);   /* sampled before the next read */
    getline(V);
    if (V->done) break;
    }

  if (V->choice == kListSequences) addinfo(V->seqid, V);
  V->allDone = true;
}


/* Reader for a single UW-GCG sequence.  The current line is the header that
 * writeseq produces as "    %s  Length: %d  (today)  Check: %d  ..\n"; the id
 * is that line cut at "  Length: ", or failing that at "..".  All following
 * lines are data.
 *
 * 10nov91: Reading GCG files caused duplication of the last line when
 *          EOF followed that line !!!
 *     fix: getline now sets *V->s = 0
 */
Local void readUWGCG(struct ReadSeqVars *V)
{
  char  *cut;

  V->nseq++;
  V->addit = (V->choice > 0);
  if (V->addit) V->seqlen = 0;

  strcpy(V->seqid, V->s);
  cut = strstr(V->seqid,"  Length: ");
  if (cut == NULL) cut = strstr(V->seqid,"..");
  if (cut != NULL) *cut = 0;

  for (;;) {
    V->done = feof(V->f);   /* sampled before the read, as in the original */
    getline(V);
    if (V->done) break;
    addseq((V->s), V);
    }

  if (V->choice == kListSequences) addinfo(V->seqid, V);
  V->allDone = true;
}


/* Reader for G. Olsen /print output from the multiple sequence editor.
 *
 * The file is rewound and scanned in two phases:
 *   header phase - lines containing "): " describe one sequence each, e.g.
 *        3 (Agr.tume):  agrobacterium.prna  18-JUN-1987 16:12
 *      For the chosen sequence the short name (sid) and the leading number
 *      (snum) are remembered; in list mode each header is added to the list.
 *   data phase   - begins after the "identity:   Data:" line.  A data line
 *      belongs to the chosen sequence when it contains sid preceded by snum;
 *      the alignment text starts 10 columns after the start of sid.
 *
 * Fixes relative to the previous version (normal input is read as before):
 *   - snum[] could be written one byte past its end (index 20 of 20);
 *   - sid[] was filled with an unbounded strcpy;
 *   - sid/snum/snumlen were used uninitialised when no header matched;
 *   - a header line with "): " but no '(' dereferenced NULL+1;
 *   - the backward scans could walk off the front of their buffers;
 *   - a data line ending less than 10 columns after sid was read beyond its
 *     terminator.
 */
Local void readOlsen(struct ReadSeqVars *V)
{
  char    *si, *sj, *sk, *sm, sid[40], snum[20];
  boolean indata = false;
  int snumlen = 0;

  sid[0] = 0;
  snum[0] = 0;

  V->addit = (V->choice > 0);
  if (V->addit) V->seqlen = 0;
  rewind(V->f); V->nseq= 0;
  do {
    getline(V);
    V->done = feof(V->f);

    if (V->done && !(*V->s)) break;
    else if (indata) {
      if ( *sid
        && (si= strstr(V->s, sid)) != NULL
        /* && (strstr(V->s, snum) == si - snumlen - 1) ) { */
        && (sm= strstr(V->s, snum)) != NULL && (sm < si - snumlen)
        && strlen(si) >= 10 ) {

        /* Spaces are valid alignment data !! */
/* 17Oct91: Error, the left margin is 21 not 22! */
/* dropped some nucs up to now -- my example file was right shifted ! */
/* variable right id margin, drop id-2 spaces at end */
/*
  VMS CC COMPILER (VAXC031) mess up:
  -- Index of 21 is chopping 1st nuc on VMS systems Only!
  Byte-for-byte same rnasep.olsen sequence file !
*/

        /* si = (V->s)+21; < was this before VMS CC wasted my time */
        si += 10;  /* use strstr index plus offset to outfox VMS CC bug */

        if ((sk = strstr(si, sid)) != NULL) *(sk-2) = 0;
        for (sk = si; *sk != 0; sk++) {
           if (*sk == ' ') *sk = '.';
           /* 18aug92: !! some olsen masks are NUMBERS !! which addseq eats */
           else if (isdigit(*sk)) *sk= nonummask[*sk - '0'];
           }

        addseq(si, V);
        }
      }

    else if ((sk = strstr(V->s, "): ")) != NULL) {  /* seq info header line */
  /* 18aug92: correct for diff seqs w/ same name -- use number, e.g. */
  /*   3 (Agr.tume):  agrobacterium.prna  18-JUN-1987 16:12 */
  /* 328 (Agr.tume):  agrobacterium.prna XYZ  19-DEC-1992   */
      (V->nseq)++;
      si = strchr(V->s,'(');
      if (si == NULL) si = V->s;   /* malformed header: use the whole line */
      else si++;
      *sk = ' ';
      if (V->choice == kListSequences) addinfo( si, V);
      else if (V->nseq == V->choice) {
        strcpy(V->seqid, si);
        sj = strchr(V->seqid, ':');
        if (sj != NULL) {
          /* back up over blanks before ':', then turn blanks inside the
             name into '_'; never step in front of the buffer */
          while (sj > V->seqid && *(--sj) == ' ') ;
          while (sj > V->seqid && --sj != V->seqid) { if (*sj == ' ') *sj = '_'; }
          }

        *sk = 0;
        while (sk > V->s && *(--sk) == ' ') *sk = 0;
        strncpy(sid, si, sizeof(sid)-1);
        sid[sizeof(sid)-1] = 0;

        si= V->s;
        while ((*si <= ' ') && (*si != 0)) si++;
        snumlen=0;
        while (si[snumlen] > ' ' && snumlen < (int)sizeof(snum)-1)
         { snum[snumlen]= si[snumlen]; snumlen++; }
        snum[snumlen]= 0;
        }

      }

    else if (strstr(V->s,"identity:   Data:")) {
      indata = true;
      if (V->choice == kListSequences) V->done = true;
      }

  } while (!V->done);

  V->allDone = true;
} /*readOlsen*/


/* Reader for gcg's MSF, mult. sequence format, interleaved !
 *
 * The file is rewound and scanned in two phases:
 *   header phase - "Name: somename  Len: 100  Check: 7009  Weight: 1.00"
 *      lines are counted; the name of the chosen sequence is kept in sid and
 *      in V->seqid.  In list mode each name line is added to the list.
 *   data phase   - begins after a line containing "//".  Every non-blank
 *      line starts with a sequence name; lines whose name equals sid
 *      contribute their remaining text to the sequence.
 *
 * Fixes relative to the previous version (normal input is read as before):
 *   - sid[] was filled with an unbounded strcpy and could be compared while
 *     still uninitialised;
 *   - a data line consisting of the name alone made addseq() start one byte
 *     past the string terminator;
 *   - the unused local atseq is gone.
 */
Local void readMSF(struct ReadSeqVars *V)
{
  char    *si, *sj, sid[128];
  boolean indata = false;
  int     iline= 0;

  sid[0] = 0;

  V->addit = (V->choice > 0);
  if (V->addit) V->seqlen = 0;
  rewind(V->f); V->nseq= 0;
  do {
    getline(V);
    V->done = feof(V->f);

    if (V->done && !(*V->s)) break;
    else if (indata) {
      /*somename  ...gpvedai .......t.. aaigr..vad tvgtgptnse aipaltaaet */
      /*       E  gvenae.kgv tentna.tad fvaqpvylpe .nqt...... kv.affynrs */

      si= V->s;
      skipwhitespace(si);
      /* for (sj= si; isalnum(*sj); sj++) ; bug -- cdelwiche uses "-", "_" and others in names*/
      for (sj= si; *sj > ' '; sj++) ;
      /* terminate the name; sj then addresses the rest of the line, or the
         terminator itself when the name ended the line */
      if (*sj) *sj++ = 0;
      if ( *si ) {
        if ( (0==strcmp(si, sid)) ) {
          addseq(sj, V);
          }
        iline++;
        }
      }

    else if (NULL != (si = strstr(V->s, "Name: "))) {  /* seq info header line */
      /* Name: somename      Len:   100  Check: 7009  Weight:  1.00 */

      (V->nseq)++;
      si += 6;
      if (V->choice == kListSequences) addinfo( si, V);
      else if (V->nseq == V->choice) {
        strcpy(V->seqid, si);
        si = V->seqid;
        skipwhitespace(si);
        /* for (sj= si; isalnum(*sj); sj++) ; -- bug */
        for (sj= si; *sj > ' '; sj++) ;
        *sj= 0;
        strncpy(sid, si, sizeof(sid)-1);
        sid[sizeof(sid)-1] = 0;
        }
      }

    else if ( strstr(V->s,"//") /*== V->s*/ )  {
      indata = true;
      iline= 0;
      if (V->choice == kListSequences) V->done = true;
      }

  } while (!V->done);


  V->allDone = true;
} /*readMSF*/



/* Reader for the interleaved form of a PAUP (NEXUS) data matrix.
 *
 * Called from readSeqMain() after the header has been scanned up to the
 * "matrix" line, so V->topnseq and V->matchchar are already set and the
 * function starts in the data state.
 *
 * While `first` is set (the first block of the matrix) each data line starts
 * a new sequence: it is counted in V->nseq, and for the chosen sequence its
 * name is kept in sid/V->seqid and its data is added.  In later blocks only
 * lines beginning with sid are added.  A line containing ';' ends the data.
 *
 * When a match character is declared, the most recent data line of
 * sequence 1 is kept in saveseq so that match characters in the chosen
 * sequence can be replaced by the corresponding characters of sequence 1.
 */
Local void readPAUPinterleaved(struct ReadSeqVars *V)
{ /* PAUP mult. sequence format, interleaved or sequential! */

  char    *si, *sj, *send, sid[40], sid1[40], saveseq[255];
  boolean first = true, indata = false, domatch;
  int     atseq= 0, iline= 0, ifmc, saveseqlen=0;

/* Replace each match character in s by the character saved at the same
   index of saveseq (only the first saveseqlen positions are examined). */
#define fixmatchchar(s) { \
  for (ifmc=0; ifmc<saveseqlen; ifmc++) \
    if (s[ifmc] == V->matchchar) s[ifmc]= saveseq[ifmc]; }

  V->addit = (V->choice > 0);
  V->seqlencount = 0;
  if (V->addit) V->seqlen = 0;
  /* rewind(V->f); V->nseq= 0;  << do in caller !*/
  indata= true; /* call here after we find "matrix" */
  domatch= (V->matchchar > 0);

  do {
    getline(V);
    V->done = feof(V->f);

    if (V->done && !(*V->s)) break;
    else if (indata) {
      /* [         1                    1                    1         ]*/
      /* human     aagcttcaccggcgcagtca ttctcataatcgcccacggR cttacatcct*/
      /* chimp     ................a.t. .c.................a ..........*/
      /* !! need to correct for V->matchchar */
      si= V->s;
      skipwhitespace(si);
      /* a ';' anywhere on the line closes the matrix; the line itself is
         still examined below */
      if (strchr(si,';')) indata= false;

      if (isalnum(*si))  {
        /* valid data line starts w/ a left-justified seq name in columns [0..8] */
        if (first) {
          (V->nseq)++;
          if (V->nseq >= V->topnseq) first= false;
          /* sj: first character after the name; send: where the name ends */
          for (sj = si; isalnum(*sj); sj++) ;
          send= sj;
          skipwhitespace(sj);
          if (V->choice == kListSequences) {
            *send= 0;
            addinfo( si, V);
            }
          else if (V->nseq == V->choice) {
            /* NOTE(review): sid1 and saveseq are recorded only inside this
               branch, i.e. only when the chosen sequence is sequence 1;
               for any other choice they look unset in the first block --
               confirm whether match characters are expanded correctly. */
            if (domatch) {
              if (V->nseq == 1) { strcpy( saveseq, sj); saveseqlen= strlen(saveseq); }
              else fixmatchchar( sj);
              }
            addseq(sj, V);
            *send= 0;
            strcpy(V->seqid, si);
            /* NOTE(review): sid holds 40 bytes and the copy is unbounded. */
            strcpy(sid, si);
            if (V->nseq == 1) strcpy(sid1, sid);
            }
          }

        /* later blocks: a line that begins with the chosen name */
        else if ( (strstr(si, sid) == si) ){
          while (isalnum(*si)) si++;
          skipwhitespace(si);
          if (domatch) {
            if (V->nseq == 1) { strcpy( saveseq, si); saveseqlen= strlen(saveseq); }
            else fixmatchchar( si);
            }
          addseq(si, V);
          }

        /* later blocks: a line that begins with the name of sequence 1 */
        else if (domatch && (strstr(si, sid1) == si)) {
          /* NOTE(review): this saves the line including the name, whereas
             the other two places save only the data after the name --
             looks inconsistent with how fixmatchchar indexes; confirm. */
          strcpy( saveseq, si);
          saveseqlen= strlen(saveseq);
          }

        iline++;
        }
      }

    else if ( strstr(V->s,"matrix") )  {
      indata = true;
      iline= 0;
      if (V->choice == kListSequences) V->done = true;
      }

  } while (!V->done);

  V->allDone = true;
} /*readPAUPinterleaved*/



/* Reader for the sequential (non-interleaved) form of a PAUP (NEXUS) data
 * matrix.
 *
 * Called from readSeqMain() after the header has been scanned up to the
 * "matrix" line, so V->topseqlen is already set and the function starts in
 * the data state.
 *
 * `atname` is set when the next data line is expected to begin with a
 * sequence name.  After a name line, following lines are continuation lines
 * of the same sequence until V->seqlencount reaches V->topseqlen, at which
 * point the next line is again a name line.  Characters are stored only for
 * the chosen sequence; for every other sequence they are merely counted.
 * A line containing ';' ends the data.
 */
Local void readPAUPsequential(struct ReadSeqVars *V)
{ /* PAUP mult. sequence format, interleaved or sequential! */
  char    *si, *sj;
  boolean atname = true, indata = false;

  V->addit = (V->choice > 0);
  if (V->addit) V->seqlen = 0;
  V->seqlencount = 0;
  /* rewind(V->f); V->nseq= 0;  << do in caller !*/
  indata= true; /* call here after we find "matrix" */
  do {
    getline(V);
    V->done = feof(V->f);

    if (V->done && !(*V->s)) break;
    else if (indata) {
      /* [         1                    1                    1         ]*/
      /* human     aagcttcaccggcgcagtca ttctcataatcgcccacggR cttacatcct*/
      /*           aagcttcaccggcgcagtca ttctcataatcgcccacggR cttacatcct*/
      /* chimp     ................a.t. .c.................a ..........*/
      /*           ................a.t. .c.................a ..........*/

      si= V->s;
      skipwhitespace(si);
      /* a ';' anywhere on the line closes the matrix; the line itself is
         still examined below */
      if (strchr(si,';')) indata= false;
      /* NOTE(review): only lines whose first non-blank character is
         alphanumeric are processed, so a continuation line starting with a
         symbol such as '.' or '-' appears to be skipped -- confirm. */
      if (isalnum(*si))  {
        /* valid data line starts w/ a left-justified seq name in columns [0..8] */
        if (atname) {
          (V->nseq)++;
          V->seqlencount = 0;
          atname= false;
          /* sj: first character after the name */
          sj= si+1;
          while (isalnum(*sj)) sj++;
          if (V->choice == kListSequences) {
            /* !! we must count bases to know when topseqlen is reached ! */
            countseq(sj, V);
            if (V->seqlencount >= V->topseqlen) atname= true;
            *sj= 0;
            addinfo( si, V);
            }
          else if (V->nseq == V->choice) {
            addseq(sj, V);
            V->seqlencount= V->seqlen;
            if (V->seqlencount >= V->topseqlen) atname= true;
            *sj= 0;
            strcpy(V->seqid, si);
            }
          else {
            countseq(sj, V);
            if (V->seqlencount >= V->topseqlen) atname= true;
            }
          }

        /* continuation line of the chosen sequence */
        else if (V->nseq == V->choice) {
          addseq(V->s, V);
          V->seqlencount= V->seqlen;
          if (V->seqlencount >= V->topseqlen) atname= true;
          }
        /* continuation line of some other sequence: count only */
        else {
          countseq(V->s, V);
          if (V->seqlencount >= V->topseqlen) atname= true;
          }
        }
      }

    else if ( strstr(V->s,"matrix") )  {
      indata = true;
      atname= true;
      if (V->choice == kListSequences) V->done = true;
      }

  } while (!V->done);

  V->allDone = true;
} /*readPAUPsequential*/


/* Reader for interleaved Phylip files.
 *
 * The current line is the header "<number of sequences> <sequence length>",
 * parsed into V->topnseq and V->topseqlen.  The first topnseq non-blank
 * lines carry a 10-column name followed by data; in later blocks the lines
 * carry data only and are attributed to sequences by their position within
 * the block (iline modulo the number of sequences).
 *
 * Fix: on a name line shorter than 10 characters the data pointer is now
 * clamped to the end of the line; previously it pointed beyond the string
 * terminator, so stale bytes from an earlier line could be added.
 */
Local void readPhylipInterleaved(struct ReadSeqVars *V)
{
  char    *si, *sj;
  boolean first = true;
  int     iline= 0;
  size_t  linelen;

  V->addit = (V->choice > 0);
  if (V->addit) V->seqlen = 0;
  V->seqlencount = 0;
  /* sscanf( V->s, "%d%d", &V->topnseq, &V->topseqlen); << topnseq == 0 !!! bad scan !! */
  si= V->s;
  skipwhitespace(si);
  V->topnseq= atoi(si);
  while (isdigit(*si)) si++;
  skipwhitespace(si);
  V->topseqlen= atol(si);
  /* fprintf(stderr,"Phylip-ileaf: topnseq=%d  topseqlen=%d\n",V->topnseq, V->topseqlen); */

  do {
    getline(V);
    V->done = feof(V->f);

    if (V->done && !(*V->s)) break;
    si= V->s;
    skipwhitespace(si);
    if (*si != 0) {

      if (first) {  /* collect seq names + seq, as fprintf(outf,"%-10s  ",seqname); */
        (V->nseq)++;
        if (V->nseq >= V->topnseq) first= false;
        linelen = strlen(V->s);
        sj= V->s + ((linelen > 10) ? 10 : linelen);  /* past name, start of data */
        if (V->choice == kListSequences) {
          *sj= 0;
          addinfo( si, V);
          }
        else if (V->nseq == V->choice) {
          addseq(sj, V);
          *sj= 0;
          strcpy(V->seqid, si);
          }
        }
      else if ( iline % V->nseq == V->choice -1 ) {
        addseq(si, V);
        }
      iline++;
    }
  } while (!V->done);

  V->allDone = true;
} /*readPhylipInterleaved*/



/* Phylip-sequential end test: count the sequence characters of the current
   line and report the end once the running total has reached the declared
   sequence length.  The terminating line is neither added nor pushed back. */
Local boolean endPhylipSequential( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
{
  *ungetend= false;
  *addend = false;
  countseq( V->s, V);
  return !(V->seqlencount < V->topseqlen);
}

/* Reader for sequential Phylip files.
 * The current line is the header "<number of sequences> <sequence length>".
 * Each sequence starts with a line whose first 10 columns are the name;
 * those columns are copied to V->seqid and blanked so that the remainder of
 * the line can be added as data by readLoop().
 */
Local void readPhylipSequential(struct ReadSeqVars *V)
{
  short  col;
  char  *cp;

  /* sscanf( V->s, "%d%d", &V->topnseq, &V->topseqlen); < ? bad sscan ? */
  cp= V->s;
  skipwhitespace(cp);
  V->topnseq= atoi(cp);
  while (isdigit(*cp)) cp++;
  skipwhitespace(cp);
  V->topseqlen= atol(cp);

  getline(V);
  for ( ; !V->allDone; ) {
    V->seqlencount= 0;
    strncpy(V->seqid, (V->s), 10);
    V->seqid[10]= 0;
    col = 0;
    while (col<10 && V->s[col]) {   /* blank out the name columns */
      V->s[col]= ' ';
      col++;
      }
    readLoop(0, true, endPhylipSequential, V);
    if (feof(V->f)) V->allDone = true;
    }
}




/* Common driver behind readSeq() and readSeqFp().
 *   V          - reader state; V->f, V->seq, V->seqid, V->choice must be set
 *   skiplines_ - number of leading lines to discard (e.g. a mail header)
 *   format_    - one of the k... format codes
 *
 * Skips the requested lines and any blank lines, then dispatches to the
 * reader for format_.  On return V->seq is NUL-terminated at V->seqlen and
 * V->err holds eFileNotFound, eNoData, eUnknownFormat or whatever the reader
 * reported.
 *
 * Fixes relative to the previous version:
 *   - the blank-line test looked at the string terminator instead of the
 *     last character, so lines made only of spaces were not skipped;
 *   - a file whose only line has no trailing newline was reported as
 *     eNoData, because end-of-file is already flagged once that line is read;
 *   - the PAUP header scan looped forever when the file ended before a
 *     "matrix" line was found;
 *   - V->topnseq and V->topseqlen were left uninitialised when a PAUP header
 *     lacks ntax= / nchar=;
 *   - the final terminator is no longer written through a NULL V->seq.
 */
Local void readSeqMain(
      struct ReadSeqVars *V,
      const long  skiplines_,
      const short format_)
{
#define tolowerstr(s) { long Itlwr, Ntlwr= strlen(s); \
  for (Itlwr=0; Itlwr<Ntlwr; Itlwr++) s[Itlwr]= to_lower(s[Itlwr]); }

  boolean gotuw;
  long l;

  V->linestart= 0;
  V->matchchar= 0;
  V->topnseq= 0;
  V->topseqlen= 0;
  if (V->f == NULL)
    V->err = eFileNotFound;
  else {

    for (l = skiplines_; l > 0; l--) getline( V);

    /* advance to the first line that is not empty or all spaces;
       l ends up as the line length without trailing spaces */
    do {
      getline( V);
      for (l= strlen(V->s); (l > 0) && (V->s[l-1] == ' '); l--) ;
    } while ((l == 0) && !feof(V->f));

    if ((l == 0) && feof(V->f)) V->err = eNoData;

    else switch (format_) {
      case kPlain : readPlain(V); break;
      case kIG    : readIG(V); break;
      case kStrider: readStrider(V); break;
      case kGenBank: readGenBank(V); break;
      case kPIR   : readPIR(V); break;
      case kNBRF  : readNBRF(V); break;
      case kPearson: readPearson(V); break;
      case kEMBL  : readEMBL(V); break;
      case kZuker : readZuker(V); break;
      case kOlsen : readOlsen(V); break;
      case kMSF   : readMSF(V); break;

      case kPAUP    : {
        boolean done= false;
        boolean interleaved= false;
        char  *cp;
        /* rewind(V->f); V->nseq= 0; ?? assume it is at top ?? skiplines ... */
        /* scan the NEXUS header for the settings we need, up to "matrix" */
        while (!done) {
          getline( V);
          tolowerstr( V->s);
          if (strstr( V->s, "matrix")) done= true;
          if (strstr( V->s, "interleav")) interleaved= true;
          if (NULL != (cp=strstr( V->s, "ntax=")) )  V->topnseq= atoi(cp+5);
          if (NULL != (cp=strstr( V->s, "nchar=")) )  V->topseqlen= atoi(cp+6);
          if (NULL != (cp=strstr( V->s, "matchchar=")) )  {
            cp += 10;
            if (*cp=='\'') cp++;
            else if (*cp=='"') cp++;
            V->matchchar= *cp;
            }
          if (feof(V->f)) done= true;   /* no "matrix" line: stop at EOF */
          }
        if (interleaved) readPAUPinterleaved(V);
        else readPAUPsequential(V);
        }
        break;

      /* kPhylip: ! can't determine in middle of file which type it is...*/
      /* test for interleave or sequential and use Phylip4(ileave) or Phylip2 */
      case kPhylip2:
        readPhylipSequential(V);
        break;
      case kPhylip4: /* == kPhylip3 */
        readPhylipInterleaved(V);
        break;

      default:
        V->err = eUnknownFormat;
        break;

      case kFitch :
        strcpy(V->seqid, V->s); getline(V);
        readFitch(V);
        break;

      case kGCG:
        do {
          gotuw = (strstr(V->s,"..") != NULL);
          if (gotuw) readUWGCG(V);
          getline(V);
        } while (!(feof(V->f) || V->allDone));
        break;
      }
    }

  V->filestart= false;
  if (V->seq != NULL)
    V->seq[V->seqlen] = 0; /* stick a string terminator on it */
}


char *readSeqFp(
      const short whichEntry_,  /* index to sequence in file */
      FILE  *fp_,   /* pointer to open seq file */
      const long  skiplines_,
      const short format_,      /* sequence file format */
      long  *seqlen_,     /* return seq size */
      short *nseq_,       /* number of seqs in file, for listSeqs() */
      short *error_,      /* return error */
      char  *seqid_)      /* return seq name/info */
{
  struct ReadSeqVars V;

  if (format_ < kMinFormat || format_ > kMaxFormat) {
    *error_ = eUnknownFormat;
    *seqlen_ = 0;
    return NULL;
    }

  V.choice = whichEntry_;
  V.fname  = NULL;  /* don't know */
  V.seq    = (char*) calloc(1, kStartLength+1);
  V.maxseq = kStartLength;
  V.seqlen = 0;
  V.seqid  = seqid_;

  V.f = fp_;
  V.filestart= (ftell( fp_) == 0);
  /* !! in sequential read, must remove current seq position from choice/whichEntry_ counter !! ... */
  if (V.filestart)  V.nseq = 0;
  else V.nseq= *nseq_;  /* track where we are in file...*/

  *V.seqid = '\0';
  V.err = 0;
  V.nseq = 0;
  V.isseqchar = isSeqChar;
  if (V.choice == kListSequences) ; /* leave as is */
  else if (V.choice <= 0) V.choice = 1; /* default ?? */
  V.addit = (V.choice > 0);
  V.allDone = false;

  readSeqMain(&V, skiplines_, format_);

  *error_ = V.err;
  *seqlen_ = V.seqlen;
  *nseq_ = V.nseq;
  return V.seq;
}

char *readSeq(
      const short whichEntry_,  /* index to sequence in file */
      const char  *filename_,   /* file name */
      const long  skiplines_,
      const short format_,      /* sequence file format */
      long  *seqlen_,     /* return seq size */
      short *nseq_,       /* number of seqs in file, for listSeqs() */
      short *error_,      /* return error */
      char  *seqid_)      /* return seq name/info */
{
  struct ReadSeqVars V;

  if (format_ < kMinFormat || format_ > kMaxFormat) {
    *error_ = eUnknownFormat;
    *seqlen_ = 0;
    return NULL;
    }

  V.choice = whichEntry_;
  V.fname  = filename_;  /* don't need to copy string, just ptr to it */
  V.seq    = (char*) calloc(1, kStartLength+1);
  V.maxseq = kStartLength;
  V.seqlen = 0;
  V.seqid  = seqid_;

  V.f = NULL;
  *V.seqid = '\0';
  V.err = 0;
  V.nseq = 0;
  V.isseqchar = isSeqChar;
  if (V.choice == kListSequences) ; /* leave as is */
  else if (V.choice <= 0) V.choice = 1; /* default ?? */
  V.addit = (V.choice > 0);
  V.allDone = false;

  V.f = fopen(V.fname, "r");
  V.filestart= true;

  readSeqMain(&V, skiplines_, format_);

  if (V.f != NULL) fclose(V.f);
  *error_ = V.err;
  *seqlen_ = V.seqlen;
  *nseq_ = V.nseq;
  return V.seq;
}





/* Return the listing of sequence ids in a file, as produced by readSeq()
   when asked for kListSequences.  The per-sequence name and length that
   readSeq() also reports are discarded. */
char *listSeqs(
      const char  *filename_,   /* file name */
      const long skiplines_,
      const short format_,      /* sequence file format */
      short *nseq_,       /* number of seqs in file, for listSeqs() */
      short *error_)      /* return error */
{
  char  idscratch[256];
  long  lenscratch;
  char  *listing;

  listing = readSeq( kListSequences, filename_, skiplines_, format_,
                     &lenscratch, nseq_, error_, idscratch);
  return listing;
}




/* Open the named file, let seqFileFormatFp() work out its format, and close
   it again.  A file that cannot be opened is passed on as a NULL stream. */
short seqFileFormat(    /* return sequence format number, see ureadseq.h */
    const char *filename,
    long  *skiplines,   /* return #lines to skip any junk like mail header */
    short *error)       /* return any error value or 0 */
{
  FILE      *in = fopen(filename, "r");
  short      detected = seqFileFormatFp( in, skiplines, error);

  if (in != NULL) fclose(in);
  return detected;
}

/* seqFileFormatFp: guess the sequence format of an already opened stream.
 *
 * Reads lines from fseq, skipping a leading mail header if one is found,
 * and looks for format-specific markers.  "High probability" markers
 * (MSF, GCG "..", Olsen, ASN.1, NEXUS) decide the format immediately;
 * the remaining markers are only remembered and the decision is taken
 * once more than one DNA/RNA line has been seen, the input ends, or
 * more than maxlines2check lines were read.
 *
 *   fseq       stream to inspect; NULL yields *error = eFileNotFound and
 *              a return value of kNoformat
 *   skiplines  receives the number of lines of mail header to skip
 *   error      receives eFileNotFound or 0
 *
 * Returns one of the k... format numbers of ureadseq.h.  For Phylip the
 * stream is rewound and re-read to choose between kPhylip4 and kPhylip2.
 */
short seqFileFormatFp(
    FILE *fseq,
    long  *skiplines,   /* return #lines to skip any junk like mail header */
    short *error)       /* return any error value or 0 */
{
  boolean   foundDNA= false, foundIG= false, foundStrider= false,
            foundGB= false, foundPIR= false, foundEMBL= false, foundNBRF= false,
            foundPearson= false, foundFitch= false, foundPhylip= false, foundZuker= false,
            gotolsen= false, gotpaup = false, gotasn1 = false, gotuw= false, gotMSF= false,
            isfitch= false,  isphylip= false, done= false;
  short     format= kUnknown;
  int       nlines= 0, k, splen= 0, otherlines= 0, aminolines= 0, dnalines= 0;
  /* Zero-filled so that the fixed-offset peeks below (sp+10) never read
     indeterminate bytes when the first lines are short. */
  char      sp[256] = "";
  long      linestart=0;
  int     maxlines2check=500;

  /* Read the next line into sp.  Once end-of-file has been seen `done`
     stays set and splen/nlines are no longer updated.  feof() is
     normalised to 0/1 because `done` is only a char wide. */
#define ReadOneLine(sp)   \
  { done |= (feof(fseq) != 0); \
    readline( fseq, sp, &linestart);  \
    if (!done) { splen = strlen(sp); ++nlines; } }

  *skiplines = 0;
  *error = 0;
  if (fseq == NULL) { *error = eFileNotFound;  return kNoformat; }

  while ( !done ) {
    ReadOneLine(sp);

    /* check for mailer head & skip past if found */
    if (nlines < 4 && !done) {
      if ((strstr(sp,"From ") == sp) || (strstr(sp,"Received:") == sp)) {
        do {
          /* skip all lines until find one blank line */
          ReadOneLine(sp);
          if (!done) for (k=0; (k<splen) && (sp[k]==' '); k++) ;
          } while ((!done) && (k < splen));
        *skiplines = nlines; /* !? do we want #lines or #bytes ?? */
        }
      }

    if (*sp==0)
      ; /* nada */

    /* high probability identities: */

    else if ( strstr(sp,"MSF:") && strstr(sp,"Type:") && strstr(sp,"Check:") )
      gotMSF= true;

    else if ((strstr(sp,"..") != NULL) && (strstr(sp,"Check:") != NULL))
      gotuw= true;

    else if (strstr(sp,"identity:   Data:") != NULL)
      gotolsen= true;

    else if ( strstr(sp,"::=") &&
      (strstr(sp,"Bioseq") ||       /* Bioseq or Bioseq-set */
       strstr(sp,"Seq-entry") ||
       strstr(sp,"Seq-submit") ) )  /* can we read submit format? */
          gotasn1= true;

    else if ( strstr(sp,"#NEXUS") == sp )
      gotpaup= true;

    /* uncertain identities: */

    else if (*sp ==';') {
      if (strstr(sp,"Strider") !=NULL) foundStrider= true;
      else foundIG= true;
      }

    else if (strstr(sp,"LOCUS") == sp)
      foundGB= true;
    else if (strstr(sp,"ORIGIN") == sp)
      foundGB= true;

    else if (strstr(sp,"ENTRY   ") == sp)  /* ? also (strcmp(sp,"\\\\\\")==0) */
      foundPIR= true;
    else if (strstr(sp,"SEQUENCE") == sp)
      foundPIR= true;

    else if (*sp == '>') {
      /* NBRF lines look like ">P1;name".  Only look at sp[3] when the
         line really is that long; a bare ">" used to test a byte left
         over from an earlier line. */
      if (sp[1] && sp[2] && sp[3] == ';') foundNBRF= true;
      else foundPearson= true;
      }

    else if (strstr(sp,"ID   ") == sp)
      foundEMBL= true;
    else if (strstr(sp,"SQ   ") == sp)
      foundEMBL= true;

    else if (*sp == '(')
      foundZuker= true;

    else {
      if (nlines - *skiplines == 1) {
        int ispp= 0, ilen= 0;
        sscanf( sp, "%d%d", &ispp, &ilen);
        if (ispp > 0 && ilen > 0) isphylip= true;
        }
      else if (isphylip && nlines - *skiplines == 2) {
        int  tseq;
        /* NOTE(review): sp+10 assumes the line holds a 10-column name;
           on a shorter line this inspects bytes past the terminator. */
        tseq= getseqtype(sp+10, strlen(sp+10));
        if ( isalpha(*sp)     /* 1st letter in 2nd line must be of a name */
         && (tseq != kOtherSeq))  /* sequence section must be okay */
            foundPhylip= true;
        }

      for (k=0, isfitch= true; isfitch & (k < splen); k++) {
        if (k % 4 == 0) isfitch &= (sp[k] == ' ');
        else isfitch &= (sp[k] != ' ');
        }
      if (isfitch & (splen > 20)) foundFitch= true;

      /* kRNA && kDNA are fairly certain...*/
      switch (getseqtype( sp, splen)) {
        case kOtherSeq: otherlines++; break;
        case kAmino   : if (splen>20) aminolines++; break;
        case kDNA     :
        case kRNA     : if (splen>20) dnalines++; break;
        case kNucleic : break; /* not much info ? */
        }

      }

                    /* pretty certain */
    if (gotolsen) {
      format= kOlsen;
      done= true;
      }
    else if (gotMSF) {
      format= kMSF;
      done= true;
      }
    else if (gotasn1) {
      /* !! we need to look further and return  kASNseqentry | kASNseqset */
      /*
        seqentry key is Seq-entry ::=
        seqset key is Bioseq-set ::=
        ?? can't read these yet w/ ncbi tools ??
          Seq-submit ::=
          Bioseq ::=  << fails both bioseq-seq and seq-entry parsers !
      */
      if (strstr(sp,"Bioseq-set")) format= kASNseqset;
      else if (strstr(sp,"Seq-entry")) format= kASNseqentry;
      else format= kASN1;  /* other form, we can't yet read... */
      done= true;
      }
    else if (gotpaup) {
      format= kPAUP;
      done= true;
      }

    else if (gotuw) {
      if (foundIG) format= kIG;  /* a TOIG file from GCG for certain */
      else format= kGCG;
      done= true;
      }

    else if ((dnalines > 1) || done || (nlines > maxlines2check)) {
          /* decide on most likely format */
          /* multichar idents: */
      if (foundStrider) format= kStrider;
      else if (foundGB) format= kGenBank;
      else if (foundPIR) format= kPIR;
      else if (foundEMBL) format= kEMBL;
      else if (foundNBRF) format= kNBRF;
          /* single char idents: */
      else if (foundIG) format= kIG;
      else if (foundPearson) format= kPearson;
      else if (foundZuker) format= kZuker;
          /* digit ident: */
      else if (foundPhylip) format= kPhylip;
          /* spacing ident: */
      else if (foundFitch) format= kFitch;
          /* no format chars: */
      else if (otherlines > 0) format= kUnknown;
      else if (dnalines > 1) format= kPlain;
      else if (aminolines > 1) format= kPlain;
      else format= kUnknown;

      done= true;
      }

    /* need this for possible long header in olsen format */
     else if (strstr(sp,"): ") != NULL)
       maxlines2check++;
    }

  if (format == kPhylip) {
    /* check for interleaved or sequential -- really messy */
    /* `done` is still set here, so ReadOneLine keeps reading lines but
       no longer refreshes splen/nlines; the code below therefore relies
       on strlen() and fixed offsets only.
       NOTE(review): the sp+10 and getseqtype(sp, 10) probes look past
       the terminator of lines shorter than 10 characters.  They are left
       as they were because the leaf/seq scoring may depend on it. */
    int tname, tseq;
    long i, j, nspp= 0, nlen= 0, ilen, leaf= 0, seq= 0;
    char  *ps;

    rewind(fseq);
    for (i=0; i < *skiplines; i++) ReadOneLine(sp);
    nlines= 0;
    ReadOneLine(sp);
    /* nspp and nlen are long, so the conversions must be %ld (was %d) */
    sscanf( sp, "%ld%ld", &nspp, &nlen);
    ReadOneLine(sp); /* 1st seq line */
    for (ps= sp+10, ilen=0; *ps!=0; ps++) if (isprint(*ps)) ilen++;

    for (i= 1; i<nspp; i++) {
      ReadOneLine(sp);

      tseq= getseqtype(sp+10, strlen(sp+10));
      tname= getseqtype(sp, 10);
      for (j=0, ps= sp; isspace(*ps) && j<10; ps++, j++);
      for (ps= sp; *ps!=0; ps++) if (isprint(*ps)) ilen++;

      /* find probable interleaf or sequential ... */
      if (j>=9) seq += 10; /* pretty certain not ileaf */
      else {
        if (tseq != tname) leaf++; else seq++;
        if (tname == kDNA || tname == kRNA) seq++; else leaf++;
        }

      if (ilen <= nlen && j<9) {
        if (tname == kOtherSeq) leaf += 10;
        else if (tname == kAmino || tname == kDNA || tname == kRNA) seq++; else leaf++;
        }
      else if (ilen > nlen) {
        ilen= 0;
        }
      }
    for ( nspp *= 2 ; i<nspp; i++) {  /* this should be only bases if interleaf */
      ReadOneLine(sp);

      tseq= getseqtype(sp+10, strlen(sp+10));
      tname= getseqtype(sp, 10);
      for (ps= sp; *ps!=0; ps++) if (isprint(*ps)) ilen++;
      for (j=0, ps= sp; isspace(*ps) && j<10; ps++, j++);
      if (j<9) {
        if (tname == kOtherSeq) seq += 10;
        if (tseq != tname) seq++; else leaf++;
        if (tname == kDNA || tname == kRNA) leaf++; else seq++;
        }
      if (ilen > nlen) {
        if (j>9) leaf += 10; /* must be a name here for sequent */
        else if (tname == kOtherSeq) seq += 10;
        ilen= 0;
        }
      }

    if (leaf > seq) format= kPhylip4;
    else format= kPhylip2;
    }

  return(format);
#undef  ReadOneLine
} /* seqFileFormatFp */




unsigned long GCGchecksum( const char *seq, const long seqlen, unsigned long *checktotal)
/* GCGchecksum: weighted sum of the upper-cased sequence characters.
 * The weight of a position cycles 1, 2, ... 57, 1, 2, ...; the sum is
 * reduced modulo 10000, added into *checktotal (itself kept modulo
 * 10000) and returned.
 */
{
  register long  pos, sum = 0;

  for (pos = 0; pos < seqlen; pos++) {
    /* (pos % 57) + 1 is the same 1..57 cycle as a counter reset at 57 */
    sum += ((pos % 57) + 1) * to_upper(seq[pos]);
    }
  sum = sum % 10000;
  *checktotal = (*checktotal + sum) % 10000;
  return sum;
}

/* Table of CRC-32's of all single byte values (made by makecrc.c of ZIP source).
 * 256 entries; CRC32checksum() indexes it with the low byte of the running
 * CRC xor-ed with the next (upper-cased) sequence character.
 */
const unsigned long crctab[] = {
  0x00000000L, 0x77073096L, 0xee0e612cL, 0x990951baL, 0x076dc419L,
  0x706af48fL, 0xe963a535L, 0x9e6495a3L, 0x0edb8832L, 0x79dcb8a4L,
  0xe0d5e91eL, 0x97d2d988L, 0x09b64c2bL, 0x7eb17cbdL, 0xe7b82d07L,
  0x90bf1d91L, 0x1db71064L, 0x6ab020f2L, 0xf3b97148L, 0x84be41deL,
  0x1adad47dL, 0x6ddde4ebL, 0xf4d4b551L, 0x83d385c7L, 0x136c9856L,
  0x646ba8c0L, 0xfd62f97aL, 0x8a65c9ecL, 0x14015c4fL, 0x63066cd9L,
  0xfa0f3d63L, 0x8d080df5L, 0x3b6e20c8L, 0x4c69105eL, 0xd56041e4L,
  0xa2677172L, 0x3c03e4d1L, 0x4b04d447L, 0xd20d85fdL, 0xa50ab56bL,
  0x35b5a8faL, 0x42b2986cL, 0xdbbbc9d6L, 0xacbcf940L, 0x32d86ce3L,
  0x45df5c75L, 0xdcd60dcfL, 0xabd13d59L, 0x26d930acL, 0x51de003aL,
  0xc8d75180L, 0xbfd06116L, 0x21b4f4b5L, 0x56b3c423L, 0xcfba9599L,
  0xb8bda50fL, 0x2802b89eL, 0x5f058808L, 0xc60cd9b2L, 0xb10be924L,
  0x2f6f7c87L, 0x58684c11L, 0xc1611dabL, 0xb6662d3dL, 0x76dc4190L,
  0x01db7106L, 0x98d220bcL, 0xefd5102aL, 0x71b18589L, 0x06b6b51fL,
  0x9fbfe4a5L, 0xe8b8d433L, 0x7807c9a2L, 0x0f00f934L, 0x9609a88eL,
  0xe10e9818L, 0x7f6a0dbbL, 0x086d3d2dL, 0x91646c97L, 0xe6635c01L,
  0x6b6b51f4L, 0x1c6c6162L, 0x856530d8L, 0xf262004eL, 0x6c0695edL,
  0x1b01a57bL, 0x8208f4c1L, 0xf50fc457L, 0x65b0d9c6L, 0x12b7e950L,
  0x8bbeb8eaL, 0xfcb9887cL, 0x62dd1ddfL, 0x15da2d49L, 0x8cd37cf3L,
  0xfbd44c65L, 0x4db26158L, 0x3ab551ceL, 0xa3bc0074L, 0xd4bb30e2L,
  0x4adfa541L, 0x3dd895d7L, 0xa4d1c46dL, 0xd3d6f4fbL, 0x4369e96aL,
  0x346ed9fcL, 0xad678846L, 0xda60b8d0L, 0x44042d73L, 0x33031de5L,
  0xaa0a4c5fL, 0xdd0d7cc9L, 0x5005713cL, 0x270241aaL, 0xbe0b1010L,
  0xc90c2086L, 0x5768b525L, 0x206f85b3L, 0xb966d409L, 0xce61e49fL,
  0x5edef90eL, 0x29d9c998L, 0xb0d09822L, 0xc7d7a8b4L, 0x59b33d17L,
  0x2eb40d81L, 0xb7bd5c3bL, 0xc0ba6cadL, 0xedb88320L, 0x9abfb3b6L,
  0x03b6e20cL, 0x74b1d29aL, 0xead54739L, 0x9dd277afL, 0x04db2615L,
  0x73dc1683L, 0xe3630b12L, 0x94643b84L, 0x0d6d6a3eL, 0x7a6a5aa8L,
  0xe40ecf0bL, 0x9309ff9dL, 0x0a00ae27L, 0x7d079eb1L, 0xf00f9344L,
  0x8708a3d2L, 0x1e01f268L, 0x6906c2feL, 0xf762575dL, 0x806567cbL,
  0x196c3671L, 0x6e6b06e7L, 0xfed41b76L, 0x89d32be0L, 0x10da7a5aL,
  0x67dd4accL, 0xf9b9df6fL, 0x8ebeeff9L, 0x17b7be43L, 0x60b08ed5L,
  0xd6d6a3e8L, 0xa1d1937eL, 0x38d8c2c4L, 0x4fdff252L, 0xd1bb67f1L,
  0xa6bc5767L, 0x3fb506ddL, 0x48b2364bL, 0xd80d2bdaL, 0xaf0a1b4cL,
  0x36034af6L, 0x41047a60L, 0xdf60efc3L, 0xa867df55L, 0x316e8eefL,
  0x4669be79L, 0xcb61b38cL, 0xbc66831aL, 0x256fd2a0L, 0x5268e236L,
  0xcc0c7795L, 0xbb0b4703L, 0x220216b9L, 0x5505262fL, 0xc5ba3bbeL,
  0xb2bd0b28L, 0x2bb45a92L, 0x5cb36a04L, 0xc2d7ffa7L, 0xb5d0cf31L,
  0x2cd99e8bL, 0x5bdeae1dL, 0x9b64c2b0L, 0xec63f226L, 0x756aa39cL,
  0x026d930aL, 0x9c0906a9L, 0xeb0e363fL, 0x72076785L, 0x05005713L,
  0x95bf4a82L, 0xe2b87a14L, 0x7bb12baeL, 0x0cb61b38L, 0x92d28e9bL,
  0xe5d5be0dL, 0x7cdcefb7L, 0x0bdbdf21L, 0x86d3d2d4L, 0xf1d4e242L,
  0x68ddb3f8L, 0x1fda836eL, 0x81be16cdL, 0xf6b9265bL, 0x6fb077e1L,
  0x18b74777L, 0x88085ae6L, 0xff0f6a70L, 0x66063bcaL, 0x11010b5cL,
  0x8f659effL, 0xf862ae69L, 0x616bffd3L, 0x166ccf45L, 0xa00ae278L,
  0xd70dd2eeL, 0x4e048354L, 0x3903b3c2L, 0xa7672661L, 0xd06016f7L,
  0x4969474dL, 0x3e6e77dbL, 0xaed16a4aL, 0xd9d65adcL, 0x40df0b66L,
  0x37d83bf0L, 0xa9bcae53L, 0xdebb9ec5L, 0x47b2cf7fL, 0x30b5ffe9L,
  0xbdbdf21cL, 0xcabac28aL, 0x53b39330L, 0x24b4a3a6L, 0xbad03605L,
  0xcdd70693L, 0x54de5729L, 0x23d967bfL, 0xb3667a2eL, 0xc4614ab8L,
  0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 0x5a05df1bL,
  0x2d02ef8dL
};

unsigned long CRC32checksum(const char *seq, const long seqlen, unsigned long *checktotal)
/* CRC32checksum: modified from CRC-32 algorithm found in ZIP compression source.
 * Computes the table-driven CRC-32 of the upper-cased first seqlen
 * characters of seq, adds it into *checktotal and returns it.
 * A seqlen of zero or less checksums nothing (the loop is bounded like
 * the one in GCGchecksum, instead of counting a negative length down).
 */
{
  register unsigned long c = 0xffffffffL;
  register long i;

  for (i = 0; i < seqlen; i++) {
    /* only the low byte selects the table entry, so stay in unsigned
       arithmetic rather than narrowing the running CRC to int */
    c = crctab[(c ^ (unsigned char) to_upper(seq[i])) & 0xff] ^ (c >> 8);
    }
  c= c ^ 0xffffffffL;
  *checktotal += c;
  return c;
}




short getseqtype( const char *seq, const long seqlen)
{ /* Classify a stretch of sequence text.
   * Looks at no more than the first 300 characters and returns one of
   * kOtherSeq, kAmino, kDNA, kRNA or kNucleic:
   *   - any letter outside the known symbol sets, or no residue at all,
   *     gives kOtherSeq;
   *   - any protein-only letter gives kAmino;
   *   - only primary nucleotide letters gives kRNA when U outnumbers T,
   *     otherwise kDNA;
   *   - a mix gives kNucleic when nucleotide letters outnumber the
   *     other amino letters, else kAmino.
   *
   * NOTE(review): strchr() also matches the terminating NUL of its
   * string argument, so a NUL byte inside the first seqlen characters
   * is counted as a protein-only letter.  Callers that pass a fixed
   * length for a shorter string (seqFileFormatFp does) get kAmino that
   * way; left unchanged here because format detection may rely on it.
   */
  char  c;
  short i, maxtest;
  short na = 0, aa = 0, po = 0, nt = 0, nu = 0, ns = 0, no = 0;

  maxtest = min(300, seqlen);
  for (i = 0; i < maxtest; i++) {
    c = to_upper(seq[i]);
    if (strchr(protonly, c)) po++;
    else if (strchr(primenuc,c)) {
      na++;
      if (c == 'T') nt++;
      else if (c == 'U') nu++;
      }
    else if (strchr(aminos,c)) aa++;
    else if (strchr(seqsymbols,c)) ns++;
    /* <ctype.h> functions need an unsigned char value; a plain char
       with the high bit set would be a negative argument */
    else if (isalpha((unsigned char) c)) no++;
    }

  if ((no > 0) || (po+aa+na == 0)) return kOtherSeq;
  /* ?? test for probability of kOtherSeq ?, e.g.,
  else if (po+aa+na / maxtest < 0.70) return kOtherSeq;
  */
  else if (po > 0) return kAmino;
  else if (aa == 0) {
    if (nu > nt) return kRNA;
    else return kDNA;
    }
  else if (na > aa) return kNucleic;
  else return kAmino;
} /* getseqtype */


/* compressSeq: return a newly malloc'ed copy of seq with every gapc
 * character removed.
 *
 *   gapc    character to drop
 *   seq     NUL-terminated sequence; NULL gives a NULL result
 *   seqlen  number of characters of seq to look at; copying stops at
 *           seqlen or at the terminating NUL, whichever comes first
 *   newlen  receives the length of the result (0 on failure)
 *
 * Returns NULL when seq is NULL, seqlen is negative or memory runs out;
 * otherwise the caller owns (and must free) the returned string.
 */
char* compressSeq( const char gapc, const char *seq, const long seqlen, long *newlen)
{
  register const char *a;
  register char *b;
  register long i, kept;
  char  *newseq, *shrunk;

  *newlen= 0;
  if (!seq || seqlen < 0) return NULL;
  newseq = (char*) malloc(seqlen+1);
  if (!newseq) return NULL;
  /* The buffer holds seqlen characters, so the copy must be bounded by
     seqlen as well and not only by the NUL terminator. */
  for (a= seq, b= newseq, i= 0, kept= 0; i < seqlen && *a != 0; a++, i++)
    if (*a != gapc) {
      *b++= *a;
      kept++;
      }
  *b= '\0';
  /* Shrinking is only an optimisation: if it fails the original,
     larger buffer is still valid and is returned as is. */
  shrunk = (char*) realloc(newseq, kept+1);
  if (shrunk) newseq = shrunk;
  *newlen= kept;
  return newseq;
}



/***
char *rtfhead = "{\\rtf1\\defformat\\mac\\deff2 \
{\\fonttbl\
  {\\f1\\fmodern Courier;}{\\f2\\fmodern Monaco;}\
  {\\f3\\fswiss Helvetica;}{\\f4\\fswiss Geneva;}\
  {\\f5\\froman Times;}{\\f6\\froman Palatino;}\
  {\\f7\\froman New Century Schlbk;}{\\f8\\ftech Symbol;}}\
{\\stylesheet\
  {\\s1 \\f5\\fs20 \\sbasedon0\\snext1 name;}\
  {\\s2 \\f3\\fs20 \\sbasedon0\\snext2 num;}\
  {\\s3 \\f1\\f21 \\sbasedon0\\snext3 seq;}}";

char *rtftail = "}";
****/

short writeSeq(FILE *outf, const char *seq, const long seqlen,
                const short outform, const char *seqid)
/* dump sequence to standard output */
{
  const short kSpaceAll = -9;
#define kMaxseqwidth  250

  boolean baseonlynum= false; /* nocountsymbols -- only count true bases, not "-" */
  short  numline = 0; /* only true if we are writing seq number line (for interleave) */
  boolean numright = false, numleft = false;
  boolean nameright = false, nameleft = false;
  short   namewidth = 8, numwidth = 8;
  short   spacer = 0, width  = 50, tab = 0;
  /* new parameters: width, spacer, those above... */

  short linesout = 0, seqtype = kNucleic;
  long  i, j, l, l1, ibase;
  char  idword[31], endstr[10];
  char  seqnamestore[128], *seqname = seqnamestore;
  char  s[kMaxseqwidth], *cp;
  char  nameform[10], numform[10], nocountsymbols[10];
  unsigned long checksum = 0, checktotal = 0;

  gPretty.atseq++;
  skipwhitespace(seqid);
  l = min(128, strlen(seqid));
  strncpy( seqnamestore, seqid, l);
  seqname[l] = 0;

  sscanf( seqname, "%30s", idword);
  sprintf(numform, "%d", seqlen);
  numwidth= strlen(numform)+1;
  nameform[0]= '\0';

  if (strstr(seqname,"checksum") != NULL) {
    cp = strstr(seqname,"bases");
    if (cp!=NULL) {
      for ( ; (cp!=seqname) && (*cp!=','); cp--) ;
      if (cp!=seqname) *cp=0;
      }
    }

  strcpy( endstr,"");
  l1 = 0;

  if (outform == kGCG || outform == kMSF)
    checksum = GCGchecksum(seq, seqlen, &checktotal);
  else
    checksum = seqchecksum(seq, seqlen, &checktotal);

  switch (outform) {

    case kPlain:
    case kUnknown:    /* no header, just sequence */
      strcpy(endstr,"\n"); /* end w/ extra blank line */
      break;

    case kOlsen:  /* Olsen seq. editor takes plain nucs OR Genbank  */
    case kGenBank:
      fprintf(outf,"LOCUS       %s       %d bp\n", idword, seqlen);
      fprintf(outf,"DEFINITION  %s, %d bases, %X checksum.\n", seqname, seqlen, checksum);
   /* fprintf(outf,"ACCESSION   %s\n", accnum); */
      fprintf(outf,"ORIGIN      \n");
      spacer = 11;
      numleft = true;
      numwidth = 8;  /* dgg. 1Feb93, patch for GDE fail to read short numwidth */
      strcpy(endstr, "\n//");
      linesout += 4;
      break;

    case kPIR:
      /* somewhat like genbank... \\\*/
      /* fprintf(outf,"\\\\\\\n"); << only at top of file, not each entry... */
      fprintf(outf,"ENTRY           %s \n", idword);
      fprintf(outf,"TITLE           %s, %d bases, %X checksum.\n", seqname, seqlen, checksum);
   /* fprintf(outf,"ACCESSION       %s\n", accnum); */
      fprintf(outf,"SEQUENCE        \n");
      numwidth = 7;
      width= 30;
      spacer = kSpaceAll;
      numleft = true;
      strcpy(endstr, "\n///");
      /* run a top number line for PIR */
      for (j=0; j<numwidth; j++) fputc(' ',outf);
      for (j= 5; j<=width; j += 5) fprintf(outf,"%10d",j);
      fputc('\n',outf);
      linesout += 5;
      break;

    case kNBRF:
      if (getseqtype(seq, seqlen) == kAmino)
        fprintf(outf,">P1;%s\n", idword);
      else
        fprintf(outf,">DL;%s\n", idword);
      fprintf(outf,"%s, %d bases, %X checksum.\n", seqname, seqlen, checksum);
      spacer = 11;
      strcpy(endstr,"*\n");
      linesout += 3;
      break;

    case kEMBL:
      fprintf(outf,"ID   %s\n", idword);
  /*  fprintf(outf,"AC   %s\n", accnum); */
      fprintf(outf,"DE   %s, %d bases, %X checksum.\n", seqname, seqlen, checksum);
      fprintf(outf,"SQ             %d BP\n", seqlen);
      strcpy(endstr, "\n//"); /* 11Oct90: bug fix*/
      tab = 4;     /** added 31jan91 */
      spacer = 11; /** added 31jan91 */
      width = 60;
      linesout += 4;
      break;

    case kGCG:
      fprintf(outf,"%s\n", seqname);
   /* fprintf(outf,"ACCESSION   %s\n", accnum); */
      fprintf(outf,"    %s  Length: %d  (today)  Check: %d  ..\n", idword, seqlen, checksum);
      spacer = 11;
      numleft = true;
      strcpy(endstr, "\n");  /* this is insurance to help prevent misreads at eof */
      linesout += 3;
      break;

    case kStrider: /* ?? map ?*/
      fprintf(outf,"; ### from DNA Strider ;-)\n");
      fprintf(outf,"; DNA sequence  %s, %d bases, %X checksum.\n;\n", seqname, seqlen, checksum);
      strcpy(endstr, "\n//");
      linesout += 3;
      break;

    case kFitch:
      fprintf(outf,"%s, %d bases, %X checksum.\n", seqname, seqlen, checksum);
      spacer = 4;
      width = 60;
      linesout += 1;
      break;

    case kPhylip2:
    case kPhylip4:
      /* this is version 3.2/3.4 -- simplest way to write
        version 3.3 is to write as version 3.2, then
        re-read file and interleave the species lines */
      if (strlen(idword)>10) idword[10] = 0;
      fprintf(outf,"%-10s  ",idword);
      l1  = -1;
      tab = 12;
      spacer = 11;
      break;

    case kASN1:
      seqtype= getseqtype(seq, seqlen);
      switch (seqtype) {
        case kDNA     : cp= "dna"; break;
        case kRNA     : cp= "rna"; break;
        case kNucleic : cp= "na"; break;
        case kAmino   : cp= "aa"; break;
        case kOtherSeq: cp= "not-set"; break;
        }
      fprintf(outf,"  seq {\n");
      fprintf(outf,"    id { local id %d },\n", gPretty.atseq);
      fprintf(outf,"    descr { title \"%s\" },\n", seqid);
      fprintf(outf,"    inst {\n");
      fprintf(outf,"      repr raw, mol %s, length %d, topology linear,\n", cp, seqlen);
      fprintf(outf,"      seq-data\n");
      if (seqtype == kAmino)
        fprintf(outf,"        iupacaa \"");
      else
        fprintf(outf,"        iupacna \"");
      l1  = 17;
      spacer = 0;
      width  = 78;
      tab  = 0;
      strcpy(endstr,"\"\n      } } ,");
      linesout += 7;
      break;

    case kPAUP:
      nameleft= true;
      namewidth = 9;
      spacer = 21;
      width  = 100;
      tab  = 0; /* 1; */
      /* strcpy(endstr,";\nend;"); << this is end of all seqs.. */
      /* do a header comment line for paup */
      fprintf(outf,"[Name: %-16s  Len:%6d  Check: %8X]\n", idword, seqlen, checksum);
      linesout += 1;
      break;

    case kPretty:
      numline= gPretty.numline;
      baseonlynum= gPretty.baseonlynum;
      namewidth = gPretty.namewidth;
      numright = gPretty.numright;
      numleft = gPretty.numleft;
      nameright = gPretty.nameright;
      nameleft = gPretty.nameleft;
      spacer = gPretty.spacer + 1;
      width  = gPretty.seqwidth;
      tab  = gPretty.tab;
      /* also add rtf formatting w/ font, size, style */
      if (gPretty.nametop) {
        fprintf(outf,"Name: %-16s  Len:%6d  Check: %8X\n", idword, seqlen, checksum);
        linesout++;
        }
      break;

    case kMSF:
      fprintf(outf," Name: %-16s Len:%6d  Check: %5d  Weight:  1.00\n",
                    idword, seqlen, checksum);
      linesout++;
      nameleft= true;
      namewidth= 15; /* need MAX namewidth here... */
      sprintf(nameform, "%%+%ds ",namewidth);
      spacer = 11;
      width  = 50;
      tab = 0; /* 1; */
      break;

    case kIG:
      fprintf(outf,";%s, %d bases, %X checksum.\n", seqname, seqlen, checksum);
      fprintf(outf,"%s\n", idword);
      strcpy(endstr,"1"); /* == linear dna */
      linesout += 2;
      break;

    default :
    case kZuker: /* don't attempt Zuker's ftn format */
    case kPearson:
      fprintf(outf,">%s, %d bases, %X checksum.\n", seqname, seqlen, checksum);
      linesout += 1;
      break;
    }

  if (*nameform==0) sprintf(nameform, "%%%d.%ds ",namewidth,namewidth);
  if (numline) sprintf(numform, "%%%ds ",numwidth);
  else sprintf(numform, "%%%dd ",numwidth);
  strcpy( nocountsymbols, kNocountsymbols);
  if (baseonlynum) {
    if (strchr(nocountsymbols,gPretty.gapchar)==NULL) {
      strcat(nocountsymbols," ");
      nocountsymbols[strlen(nocountsymbols)-1]= gPretty.gapchar;
      }
    if (gPretty.domatch && (cp=strchr(nocountsymbols,gPretty.matchchar))!=NULL) {
      *cp= ' ';
      }
    }

  if (numline) {
   *idword= 0;
   }

  width = min(width,kMaxseqwidth);
  for (i=0, l=0, ibase = 1; i < seqlen; ) {

    if (l1 < 0) l1 = 0;
    else if (l1 == 0) {
      if (nameleft) fprintf(outf, nameform, idword);
      if (numleft) { if (numline) fprintf(outf, numform, "");
                    else fprintf(outf, numform, ibase);}
      for (j=0; j<tab; j++) fputc(' ',outf);
      }

    l1++;                 /* don't count spaces for width*/
    if (numline) {
      if (spacer==kSpaceAll || (spacer != 0 && (l+1) % spacer == 1)) {
        if (numline==1) fputc(' ',outf);
        s[l++] = ' ';
        }
      if (l1 % 10 == 1 || l1 == width) {
        if (numline==1) fprintf(outf,"%-9d ",i+1);
        s[l++]= '|'; /* == put a number here */
        }
      else s[l++]= ' ';
      i++;
      }

    else {
      if (spacer==kSpaceAll || (spacer != 0 && (l+1) % spacer == 1))
        s[l++] = ' ';
      if (!baseonlynum) ibase++;
      else if (0==strchr(nocountsymbols,seq[i])) ibase++;
      s[l++] = seq[i++];
      }

    if (l1 == width || i == seqlen) {
      if (outform==kPretty) for ( ; l1<width; l1++) {
        if (spacer==kSpaceAll || (spacer != 0 && (l+1) % spacer == 1))
          s[l++] = ' ';
        s[l++]=' '; /* pad w/ blanks */
        }
      s[l] = '\0';
      l = 0; l1 = 0;

      if (numline) {
        if (numline==2) fprintf(outf,"%s",s); /* finish numberline ! and | */
        }
      else {
        if (i == seqlen) fprintf(outf,"%s%s",s,endstr);
        else fprintf(outf,"%s",s);
        if (numright || nameright) fputc(' ',outf);
        if (numright)  fprintf(outf,numform, ibase-1);
        if (nameright) fprintf(outf, nameform,idword);
        }
      fputc('\n',outf);
      linesout++;
      }
    }
  return linesout;
}  /*writeSeq*/



/* End file: ureadseq.c */
SHAR_EOF
fi # end of overwriting check
echo shar: extracting "'ureadseq.h'" '(4672 characters)'
if test -f 'ureadseq.h'
then
	echo shar: will not over-write existing file "'ureadseq.h'"
else
cat << \SHAR_EOF > 'ureadseq.h'
/* File: ureadseq.h
 *
 * Header for module UReadSeq
 */

#ifndef UREADSEQ_H
#define UREADSEQ_H



typedef char  boolean;
#define NEWLINE         '\n'
#define false 0
#define true  1
/* Fully parenthesised so the macros can be used inside larger
   expressions, e.g. 1 + min(a,b).  Arguments are evaluated twice. */
#define min(a,b)      (((a)<(b))?(a):(b))
#define max(a,b)      (((a)>(b))?(a):(b))
/* Advance the pointer variable `string` past leading blanks and control
   characters.  The comparison is done on the unsigned value so that
   bytes >= 0x80 are not mistaken for white space where char is signed. */
#define skipwhitespace(string)  {while ((unsigned char)*string <= ' ' && *string != 0) string++;}

  /* NLM strings */
  /* Locale-independent case tests and conversions for the letters
     'A'..'Z' / 'a'..'z'.  Conversion adds or subtracts ' ' (the distance
     between the two cases in ASCII) and casts the result to char.
     The argument is evaluated more than once, so avoid side effects. */
#define is_upper(c) ('A'<=(c) && (c)<='Z')
#define is_lower(c) ('a'<=(c) && (c)<='z')
#define to_lower(c) ((char)(is_upper(c) ? (c)+' ' : (c)))
#define to_upper(c) ((char)(is_lower(c) ? (c)-' ' : (c)))


  /* readSeq errors */
  /* All negative; 0 is used for "no error" (seqFileFormatFp clears its
     error result to 0 before it starts). */
#define eFileNotFound   -1
#define eNoData         -2
#define eMemFull        -3
#define eItemNotFound   -4
#define eOneFormat      -5
#define eUnequalSize    -6
#define eFileCreate     -7
#define eUnknownFormat  -8
#define eOptionBad      -9
#define eASNerr         -10

  /* magic number for readSeq(whichEntry) to give seq list */
#define kListSequences  -1

  /* sequence types parsed by getseqtype */
#define kOtherSeq   0
#define kDNA        1
#define kRNA        2
#define kNucleic    3
#define kAmino      4

  /* formats known to readSeq */
  /* kPhylip3 and kPhylip are aliases of kPhylip4, so they cannot be told
     apart from it in a switch or comparison. */
#define kIG             1
#define kGenBank        2
#define kNBRF           3
#define kEMBL           4
#define kGCG            5
#define kStrider        6
#define kFitch          7
#define kPearson        8
#define kZuker          9
#define kOlsen          10
#define kPhylip2        11
#define kPhylip4        12
#define kPhylip3        kPhylip4
#define kPhylip         kPhylip4
#define kPlain          13  /* keep this at #13 */
#define kPIR            14
#define kMSF            15
#define kASN1           16
#define kPAUP           17
#define kPretty         18

#define kMaxFormat      18
#define kMinFormat      1
#define kNoformat       -1    /* format not tested */
#define kUnknown        0     /* format not determinable */

  /* subsidiary types */
  /* seqFileFormatFp returns kASNseqset / kASNseqentry for ASN.1 input
     whose header line names "Bioseq-set" / "Seq-entry". */
#define kASNseqentry    51
#define kASNseqset      52

#define kPhylipInterleave 61
#define kPhylipSequential 62


/* Layout options for the kPretty output format; writeSeq() reads them
   from the global gPretty.  Fields without a remark are not used by the
   code visible next to this header -- NOTE(review): confirm their
   meaning against the option parser. */
typedef struct  {
  /* baseonlynum: number only residues, skipping the no-count symbols */
  boolean isactive, baseonlynum;
  /* numright/numleft: print the position number right/left of each line */
  boolean numright, numleft, numtop, numbot;
  /* nameright/nameleft: print the name right/left of each line;
     nametop: print a "Name: ..." header line before the sequence */
  boolean nameright, nameleft, nametop;
  /* domatch: when set, matchchar is taken out of the no-count symbols */
  boolean noleaves, domatch, degap;
  /* gapchar is added to the no-count symbols when baseonlynum is set */
  char  matchchar, gapchar;
  /* numline: non-zero makes writeSeq() emit a number/ruler line instead
     of residues (it tests for the values 1 and 2);
     atseq: incremented by every writeSeq() call */
  short numline, atseq;
  /* namewidth: width of the name column */
  short namewidth, numwidth;
  /* spacer: writeSeq() uses spacer+1 as its blank-insertion period;
     seqwidth: residues per line; tab: blanks written before each line */
  short interline, spacer, seqwidth, tab;
  } prettyopts;

/* Reset a prettyopts variable to its defaults: all flags off except
   baseonlynum, '.' as match and '-' as gap character, name width 8,
   number width 5, interline 1, spacer 10, 50 residues per line, no tab.
   `p` must be a prettyopts object (not a pointer); it is used
   unparenthesised, so pass a plain variable name. */
#define gPrettyInit(p) { \
  p.isactive=false;\
  p.baseonlynum=true;\
  p.numline= p.atseq= 0;\
  p.numright= p.numleft= p.numtop= p.numbot= false;\
  p.nameright= p.nameleft= p.nametop= false;\
  p.noleaves= p.domatch= p.degap= false;\
  p.matchchar='.';\
  p.gapchar='-';\
  p.namewidth=8;\
  p.numwidth=5;\
  p.interline=1;\
  p.spacer=10;\
  p.seqwidth=50;\
  p.tab=0; }

#ifdef UREADSEQ_G
prettyopts  gPretty;
#else
extern  prettyopts  gPretty;
#endif


#ifdef __cplusplus
extern "C" {
#endif

extern short seqFileFormat(const char *filename, long *skiplines, short *error );
extern short seqFileFormatFp(FILE *fseq, long  *skiplines, short *error );

extern char *listSeqs(const char *filename, const long skiplines,
                       const short format, short *nseq, short *error );

extern char *readSeq(const short whichEntry, const char *filename,
                      const long skiplines, const short format,
                      long *seqlen, short *nseq, short *error, char *seqid );

extern char *readSeqFp(const short whichEntry_, FILE  *fp_,
  const long  skiplines_, const short format_,
        long  *seqlen_,  short *nseq_, short *error_, char *seqid_ );

extern short writeSeq(FILE *outf, const char *seq, const long seqlen,
                       const short outform, const char *seqid );

extern unsigned long CRC32checksum(const char *seq, const long seqlen, unsigned long *checktotal);
extern unsigned long GCGchecksum(const char *seq, const long seqlen, unsigned long *checktotal);
#ifdef SMALLCHECKSUM
#define seqchecksum  GCGchecksum
#else
#define seqchecksum  CRC32checksum
#endif

extern short getseqtype(const char *seq, const long seqlen );
extern char *compressSeq( const char gapc, const char *seq, const long seqlen, long *newlen);

#ifdef NCBI

extern char *listASNSeqs(const char *filename, const long skiplines,
                  const short format, short *nseq, short *error );

extern char *readASNSeq(const short whichEntry, const char *filename,
                const long skiplines, const short format,
                long *seqlen, short *nseq, short *error, char **seqid );
#endif


  /* patches for some missing string.h stuff */
extern int Strcasecmp(const char *a, const char *b);
extern int Strncasecmp(const char *a, const char *b, long maxn);

#ifdef __cplusplus
}
#endif

#endif /*UREADSEQ_H*/

SHAR_EOF
fi # end of overwriting check
echo shar: extracting "'ureadasn.c'" '(8560 characters)'
if test -f 'ureadasn.c'
then
	echo shar: will not over-write existing file "'ureadasn.c'"
else
cat << \SHAR_EOF > 'ureadasn.c'
/* ureadasn.c
  -- parse, mangle and otherwise rewrite ASN1 file/entries for readseq reading
  -- from NCBI toolkit (ncbi.nlm.nih.gov:/toolkit)
*/

#ifdef NCBI

#include <stdio.h>
#include <ctype.h>
#include <string.h>

/* NCBI toolkit :include: must be on lib path */
#include <ncbi.h>
#include <seqport.h>

#define UREADASN
#include "ureadseq.h"

#pragma segment ureadasn

/* this stuff is hacked up from tofasta.c of ncbitools */
#define   kBaseAny   0
#define   kBaseNucleic 1
#define   kBaseAmino   2

/* Argument bundle handed through SeqEntryExplore() to SeqEntryRawseq(),
   which unpacks it into a BioseqRawToRaw() call.  Every pointer member
   refers to storage owned by the caller of SeqEntryToRaw(). */
typedef struct tofasta {
    Boolean idonly;     /* collect only the id/title text, no residues */
    short   *seqnum;    /* running count of raw/const Bioseqs seen */
    short   whichSeq;   /* entry wanted; 0 accepts every entry */
    char    **seq, **seqid;  /* result buffers, created or grown on demand */
    long    *seqlen;    /* receives the length of *seq */
} FastaDat, PNTR FastaPtr;


/* BioseqRawToRaw: append the id text and, unless idonly is set, the
 * residues of one raw/const Bioseq to the caller's buffers.
 *
 *   bsp       Bioseq to convert; NULL or any other representation is
 *             ignored
 *   idonly    only build the "n)  id title\n" listing line in *seqid
 *   whichSeq  number of the wanted sequence, 0 for all
 *   seqnum    running sequence counter, incremented for every raw/const
 *             Bioseq seen, wanted or not
 *   seq       in/out: NULL or an existing NUL-terminated buffer that is
 *             extended; always left pointing at valid storage
 *   seqid     in/out: NULL or an existing id string that is extended
 *   seqlen    out: length of *seq
 *
 * On memory exhaustion the function returns early: *seqid keeps its old
 * value, *seq keeps whatever residues were collected so far (still
 * NUL-terminated), and the SeqPort is always released.
 */
void BioseqRawToRaw(BioseqPtr bsp, Boolean idonly,
              short whichSeq, short *seqnum,
              char **seq, char **seqid, long *seqlen)
{
  SeqPortPtr spp;
  SeqIdPtr bestid;
  Uint1 repr, code, residue;
  CharPtr tmp, title;
  long  outlen, outmax, room;
  char  localid[256], *sp, *grown;

  /* !!! this may be called several times for a single sequence
    because SeqEntryExplore looks for parts and joins them...
    assume seq, seqid, seqlen may contain data (or NULL)
  */
  if (bsp == NULL) return;
  repr = Bioseq_repr(bsp);
  if (!(repr == Seq_repr_raw || repr == Seq_repr_const)) return;

  (*seqnum)++;
  if (!(whichSeq == *seqnum || whichSeq == 0)) return;

  bestid = SeqIdFindBest(bsp->id, (Uint1) 0);
  title = BioseqGetTitle(bsp);
  if (idonly) {
    sprintf(localid, " %d)  ", *seqnum);
    tmp= localid + strlen(localid)-1;
    }
  else {
    strcpy(localid," ");
    tmp= localid;
    }
  tmp = SeqIdPrint(bestid, tmp, PRINTID_FASTA_SHORT);
  tmp = StringMove(tmp, " ");
  /* Copy at most 200 title characters, but never more than still fit in
     localid, and terminate explicitly.
     NOTE(review): assumes SeqIdPrint/StringMove return a pointer to the
     end of what they wrote inside localid (the chaining above implies
     it) and that StringNCpy pads/truncates like strncpy -- confirm. */
  room = (long) sizeof(localid) - 1 - (long) (tmp - localid);
  if (room > 200) room = 200;
  if (room > 0) {
    StringNCpy(tmp, title, room);
    tmp[room] = '\0';
    }
  else localid[sizeof(localid)-1] = '\0';
/* fprintf(stderr,"BioseqRawToRaw: localid='%s'\n",localid); */

          /* < seqid is fixed storage */
  /* strcpy( *seqid, localid);  */
          /* < seqid is variable sized */
  outmax= strlen(localid) + 3;
  if (*seqid==NULL) {
    *seqid= (char*) malloc(outmax);
    if (*seqid==NULL) return;
    strcpy(*seqid, localid);
    }
  else {
    outmax += strlen(*seqid) + 2;
    /* keep the old id if it cannot be grown instead of losing it */
    grown= (char*) realloc( *seqid, outmax);
    if (grown==NULL) return;
    *seqid= grown;
    if (!idonly) strcat(*seqid, "; ");
    strcat(*seqid, localid);
    }

  if (idonly) {
    strcat(*seqid,"\n");
    return;
    }

  if (ISA_na(bsp->mol)) code = Seq_code_iupacna;
  else code = Seq_code_iupacaa;
  spp = SeqPortNew(bsp, 0, -1, 0, code);
  if (spp == NULL) return;
  SeqPortSeek(spp, 0, SEEK_SET);

  sp= *seq;
  if (sp==NULL) {  /* this is always true now !? */
    outlen= 0;
    outmax= 500;
    sp= (char*) malloc(outmax);
    if (sp==NULL) { SeqPortFree(spp); return; }
    }
  else {
    outlen= strlen(sp);
    outmax= outlen + 500;
    grown= (char*) realloc( sp, outmax);
    if (grown==NULL) { SeqPortFree(spp); return; }  /* *seq is untouched */
    sp= grown;
    }
  *seq= sp;   /* the old block may have moved */

  while ((residue = SeqPortGetResidue(spp)) != SEQPORT_EOF) {
    /* always keep one byte free for the terminator */
    if (outlen + 1 >= outmax) {
      grown= (char*) realloc(sp, outmax + 500);
      if (grown==NULL) break;   /* out of memory: keep what we have */
      sp= grown;
      outmax += 500;
      }
    sp[outlen++] = residue;
    }
  sp[outlen]= '\0';
  /* trimming to size is optional; on failure the larger block stays */
  grown= (char*) realloc(sp, outlen+1);
  if (grown!=NULL) sp= grown;
  *seq= sp;
  *seqlen= outlen;
  SeqPortFree(spp);
  return;
}


/* SeqEntryRawseq: SeqEntryExplore() callback.  Entries that are not a
 * Bioseq are ignored; for a Bioseq the FastaDat bundle passed in `data`
 * is unpacked into a BioseqRawToRaw() call.  index and indent are part
 * of the callback signature and are not used.
 */
static void SeqEntryRawseq(SeqEntryPtr sep, Pointer data, Int4 index, Int2 indent)
{
  if (IS_Bioseq(sep)) {
    BioseqPtr bioseq = (BioseqPtr)sep->data.ptrvalue;
    FastaPtr  opts   = (FastaPtr) data;

    BioseqRawToRaw(bioseq, opts->idonly, opts->whichSeq, opts->seqnum,
                    opts->seq, opts->seqid, opts->seqlen);
    }
}

/* SeqEntryToRaw: walk a Seq-entry with SeqEntryExplore() and run
 * SeqEntryRawseq() on every node, passing the caller's options and
 * result pointers along in a FastaDat bundle.  A NULL entry is a no-op.
 */
void SeqEntryToRaw(SeqEntryPtr sep, Boolean idonly, short whichSeq, short *seqnum,
                        char **seq, char **seqid, long *seqlen)
{
  FastaDat bundle;

  if (sep != NULL) {
    /* what to pick ... */
    bundle.whichSeq= whichSeq;
    bundle.idonly= idonly;
    bundle.seqnum= seqnum;
    /* ... and where to put it */
    bundle.seqid = seqid;
    bundle.seq   = seq;
    bundle.seqlen= seqlen;
    SeqEntryExplore(sep, (Pointer)&bundle, SeqEntryRawseq);
    }
}




char *listASNSeqs(const char *filename, const long skiplines,
                  const short format,   /* note: this is kASNseqentry or kASNseqset */
                  short *nseq, short *error )
{
  AsnIoPtr aip = NULL;
  SeqEntryPtr the_set;
  AsnTypePtr atp, atp2;
  AsnModulePtr amp;
  Boolean inIsBinary= FALSE; /* damn, why can't asn routines test this? */
  char  *seq = NULL;
  char  *seqid = NULL, stemp[256];
  long  seqlen;
  int   i, count;

  *nseq= 0;
  *error= 0;

    /* asn dictionary setups */
/*fprintf(stderr,"listASNSeqs: SeqEntryLoad\n");*/
  if (! SeqEntryLoad()) goto errxit; /*  sequence alphabets (and sequence parse trees) */
  amp = AsnAllModPtr();   /* get pointer to all loaded ASN.1 modules */
  if (amp == NULL) goto errxit;
  atp = AsnFind("Bioseq-set");    /* get the initial type pointers */
  if (atp == NULL) goto errxit;
  atp2 = AsnFind("Bioseq-set.seq-set.E");
  if (atp2 == NULL) goto errxit;

/*fprintf(stderr,"listASNSeqs: AsnIoOpen\n");*/
      /* open the ASN.1 input file in the right mode */
      /* !!!! THIS FAILS when filename has MAC PATH (& other paths?) (:folder:filename) */
  if ((aip = AsnIoOpen(filename, inIsBinary?"rb":"r")) == NULL) goto errxit;
  for (i=0; i<skiplines; i++) fgets( stemp, 255, aip->fp);  /* this may mess up asn routines... */

  if (! ErrSetLog ("stderr"))  goto errxit;
  else ErrSetOpts(ERR_CONTINUE, ERR_LOG_ON);    /*??  log errors instead of die */

  if (format == kASNseqentry) {  /* read one Seq-entry */
/*fprintf(stderr,"listASNSeqs: SeqEntryAsnRead\n");*/
    the_set = SeqEntryAsnRead(aip, NULL);
    SeqEntryToRaw(the_set, true,  0, nseq, &seq, &seqid, &seqlen);
    if (seq) free(seq); seq= NULL;
    SeqEntryFree(the_set);
    }
  else   {                   /* read Seq-entry's from a Bioseq-set */
    count = 0;
/*fprintf(stderr,"listASNSeqs: AsnReadId\n");*/
    while ((atp = AsnReadId(aip, amp, atp)) != NULL) {
      if (atp == atp2)  {  /* top level Seq-entry */
        the_set = SeqEntryAsnRead(aip, atp);
        SeqEntryToRaw(the_set, true, 0, nseq, &seq, &seqid, &seqlen);
        SeqEntryFree(the_set);
        if (seq) free(seq); seq= NULL;
        }
      else
        AsnReadVal(aip, atp, NULL);
      count++;
      }
    }

  AsnIoClose(aip);
  *error= 0;
  return seqid;

errxit:
  AsnIoClose(aip);
  if (seqid) free(seqid);
  *error= eASNerr;
  return NULL;
}


/* readASNSeq
 * Read one sequence from an ASN.1 file.
 *   whichEntry -- index of the wanted sequence, passed through to SeqEntryToRaw;
 *                 for a Bioseq-set, reading stops once *nseq >= whichEntry
 *   filename   -- file to open with AsnIoOpen (text mode; inIsBinary is fixed FALSE)
 *   skiplines  -- number of fgets() reads to discard before ASN parsing starts
 *   format     -- kASNseqentry: read a single Seq-entry;
 *                 anything else: walk a Bioseq-set
 *   seqlen     -- zeroed here, then filled in through SeqEntryToRaw
 *   nseq       -- zeroed here, then used by SeqEntryToRaw as its sequence counter
 *   error      -- 0 on success, eASNerr when setup or open fails
 *   seqid      -- passed through to SeqEntryToRaw, which stores the id there
 * Returns the sequence buffer produced by SeqEntryToRaw (caller frees it),
 * or NULL on error or when nothing was read.
 */
char *readASNSeq(const short whichEntry, const char *filename,
                const long skiplines,
                const short format,     /* note: this is kASNseqentry or kASNseqset */
                long *seqlen, short *nseq,
                short *error, char **seqid )
{
  AsnIoPtr aip = NULL;
  SeqEntryPtr the_set;
  AsnTypePtr atp, atp2;
  AsnModulePtr amp;
  Boolean inIsBinary= FALSE; /* damn, why can't asn routines test this? */
  char  *seq = NULL;
  /* Was stemp[200] while fgets() below was told 255: a long header line
   * overran the buffer.  Sized to 256 like listASNSeqs, and the read size
   * kept at 255, so both functions skip exactly the same bytes. */
  char  stemp[256];
  long  i;   /* long to match skiplines */

  *seqlen= 0;
  *nseq= 0;
  *error= 0;

/*fprintf(stderr,"readASNseq: SeqEntryLoad\n");*/
    /* asn dictionary setups */
  if (! SeqEntryLoad()) goto errxit; /*  sequence alphabets (and sequence parse trees) */
  amp = AsnAllModPtr();   /* get pointer to all loaded ASN.1 modules */
  if (amp == NULL) goto errxit;
  atp = AsnFind("Bioseq-set");    /* get the initial type pointers */
  if (atp == NULL) goto errxit;
  atp2 = AsnFind("Bioseq-set.seq-set.E");
  if (atp2 == NULL) goto errxit;

      /* open the ASN.1 input file in the right mode */
/*fprintf(stderr,"readASNseq: AsnIoOpen(%s)\n", filename);*/
  if ((aip = AsnIoOpen(filename, inIsBinary?"rb":"r")) == NULL) goto errxit;
      /* discard leading lines; this may mess up asn routines... */
  for (i=0; i<skiplines; i++) {
    if (fgets( stemp, 255, aip->fp) == NULL) break;  /* EOF: nothing more to skip */
    }

  if (! ErrSetLog ("stderr"))  goto errxit;
  else ErrSetOpts(ERR_CONTINUE, ERR_LOG_ON);    /*??  log errors instead of die */

  if (format == kASNseqentry) {  /* read one Seq-entry */
/*fprintf(stderr,"readASNseq: SeqEntryAsnRead\n");*/
    the_set = SeqEntryAsnRead(aip, NULL);
    SeqEntryToRaw(the_set, false, whichEntry, nseq, &seq, seqid, seqlen);
    SeqEntryFree(the_set);
    goto goodexit;
    }

  else   {                   /* read Seq-entry's from a Bioseq-set */
/*fprintf(stderr,"readASNseq: AsnReadId\n");*/
    while ((atp = AsnReadId(aip, amp, atp)) != NULL) {
      if (atp == atp2)  {  /* top level Seq-entry */
        the_set = SeqEntryAsnRead(aip, atp);
        SeqEntryToRaw(the_set, false, whichEntry, nseq, &seq, seqid, seqlen);
        SeqEntryFree(the_set);
        if (*nseq >= whichEntry) goto goodexit;  /* wanted entry reached: stop */
        }
      else
        AsnReadVal(aip, atp, NULL);   /* not a Seq-entry: consume and ignore */
      }
    }

goodexit:
  AsnIoClose(aip);
  *error= 0;
  return seq;

errxit:
  /* aip is still NULL when a setup step failed before the open */
  if (aip != NULL) AsnIoClose(aip);
  *error= eASNerr;
  if (seq) free(seq);
  return NULL;
}


#endif /*NCBI*/
SHAR_EOF
fi # end of overwriting check
echo shar: extracting "'Readme'" '(6269 characters)'
if test -f 'Readme'
then
	echo shar: will not over-write existing file "'Readme'"
else
cat << \SHAR_EOF > 'Readme'

 * ReadSeq  -- 1 Feb 93
 *
 * Reads and writes nucleic/protein sequences in various
 * formats. Data files may have multiple sequences.
 *
 * Copyright 1990 by d.g.gilbert
 * biology dept., indiana university, bloomington, in 47405
 * e-mail: gilbertd@bio.indiana.edu
 *
 * This program may be freely copied and used by anyone.
 * Developers are encouraged to incorporate parts in their
 * programs, rather than devise their own private sequence
 * format.
 *
 * This should compile and run with any ANSI C compiler.
 * Please advise me of any bugs, additions or corrections.

Readseq has been updated.   There have been a number of enhancements
and a few bug corrections since the previous general release in Nov 91
(see below).  If you are using earlier versions, I recommend you update to
this release.

Readseq is particularly useful as it automatically detects many
sequence formats, and interconverts among them.
Formats added to this release include
  + MSF multi sequence format used by GCG software
  + PAUP's multiple sequence (NEXUS) format
  + PIR/CODATA format used by PIR
  + ASN.1 format used by NCBI
  + Pretty print with various options for nice looking output.

As well, Phylip format can now be used as input.  Options to
reverse-complement and to degap sequences have been added.  A menu
addition for users of the GDE sequence editor is included.

This program is available thru Internet gopher, as

  gopher ftp.bio.indiana.edu
  browse into the IUBio-Software+Data/molbio/readseq/ folder
  select the readseq.shar document

Or thru anonymous FTP in this manner:
  my_computer> ftp  ftp.bio.indiana.edu  (or IP address 129.79.224.25)
    username:  anonymous
    password:  my_username@my_computer
  ftp> cd molbio/readseq
  ftp> get readseq.shar
  ftp> bye

readseq.shar is a Unix shell archive of the readseq files.
This file can be edited by any text editor to reconstitute the
original files, for those who do not have a Unix system or an
Unshar program.  Read the top of this .shar file for further
instructions.

There are also pre-compiled executables for the following computers:
Silicon Graphics Iris, Sparc (Sun Sparcstation & clones), VMS-Vax,
Macintosh. Use binary ftp to transfer these, except Macintosh.  The
Mac version is just the command-line program in a window, not very
handy.

C source files:
  readseq.c ureadseq.c ureadasn.c ureadseq.h
Document files:
  Readme (this doc)
  Readseq.help (longer than this doc)
  Formats (description of sequence file formats)
  add.gdemenu (GDE program users can add this to the .GDEmenu file)
  Stdfiles -- test sequence files
  Makefile -- Unix make file
  Make.com -- VMS make file
  *.std    -- files for testing validity of readseq


Example usage:
  readseq
      -- for interactive use
  readseq my.1st.seq  my.2nd.seq  -all  -format=genbank  -output=my.gb
      -- convert all of two input files to one genbank format output file
  readseq my.seq -all -form=pretty -nameleft=3 -numleft -numright -numtop -match
      -- output to standard output a file in a pretty format
  readseq my.seq -item=9,8,3,2 -degap -CASE -rev -f=msf -out=my.rev
      -- select 4 items from input, degap, reverse, and uppercase them
  cat *.seq | readseq -pipe -all -format=asn > bunch-of.asn
      -- pipe a bunch of data thru readseq, converting all to asn


The brief usage of readseq is as follows. The "[]" denote
optional parts of the syntax:

  readseq -help
readSeq (27Dec92), multi-format molbio sequence reader.
usage: readseq [-options] in.seq > out.seq
 options
    -a[ll]         select All sequences
    -c[aselower]   change to lower case
    -C[ASEUPPER]   change to UPPER CASE
    -degap[=-]     remove gap symbols
    -i[tem=2,3,4]  select Item number(s) from several
    -l[ist]        List sequences only
    -o[utput=]out.seq  redirect Output
    -p[ipe]        Pipe (command line, <stdin, >stdout)
    -r[everse]     change to Reverse-complement
    -v[erbose]     Verbose progress
    -f[ormat=]#    Format number for output,  or
    -f[ormat=]Name Format name for output:
         1. IG/Stanford           10. Olsen (in-only)
         2. GenBank/GB            11. Phylip3.2
         3. NBRF                  12. Phylip
         4. EMBL                  13. Plain/Raw
         5. GCG                   14. PIR/CODATA
         6. DNAStrider            15. MSF
         7. Fitch                 16. ASN.1
         8. Pearson/Fasta         17. PAUP
         9. Zuker                 18. Pretty (out-only)

   Pretty format options:
    -wid[th]=#            sequence line width
    -tab=#                left indent
    -col[space]=#         column space within sequence line on output
    -gap[count]           count gap chars in sequence numbers
    -nameleft, -nameright[=#]   name on left/right side [=max width]
    -nametop              name at top/bottom
    -numleft, -numright   seq index on left/right side
    -numtop, -numbot      index on top/bottom
    -match[=.]            use match base for 2..n species
    -inter[line=#]        blank line(s) between sequence blocks



Recent changes:

4 May 92
+ added 32 bit CRC checksum as alternative to GCG 6.5bit checksum
Aug 92
= fixed Olsen format input to handle files w/ more sequences,
  not to mess up when more than one seq has same identifier,
  and to convert number masks to symbols.
= IG format fix to understand ^L
30 Dec 92
* revised command-line & interactive interface.  Suggested form is now
    readseq infile -format=genbank -output=outfile -item=1,3,4 ...
  but remains compatible with prior commandlines:
    readseq infile -f2 -ooutfile -i3 ...
+ added GCG MSF multi sequence file format
+ added PIR/CODATA format
+ added NCBI ASN.1 sequence file format
+ added Pretty, multi sequence pretty output (only)
+ added PAUP multi seq format
+ added degap option
+ added Gary Williams (GWW, G.Williams@CRC.AC.UK) reverse-complement option.
+ added support for reading Phylip formats (interleave & sequential)
* string fixes, dropped need for compiler flags NOSTR, FIXTOUPPER, NEEDSTRCASECMP
* changed 32bit checksum to default, -DSMALLCHECKSUM for GCG version

1Feb93
= reverted Genbank output format to fixed left margin 
  (change in 30 Dec release), so GDE and others relying on fixed margin
  can read this.
SHAR_EOF
fi # end of overwriting check
echo shar: extracting "'Readseq.help'" '(9345 characters)'
if test -f 'Readseq.help'
then
	echo shar: will not over-write existing file "'Readseq.help'"
else
cat << \SHAR_EOF > 'Readseq.help'

 * ReadSeq.Help -- 30 Dec 92
 *
 * Reads and writes nucleic/protein sequences in various
 * formats. Data files may have multiple sequences.
 *
 * Copyright 1990 by d.g.gilbert
 * biology dept., indiana university, bloomington, in 47405
 * e-mail: gilbertd@bio.indiana.edu
 *
 * This program may be freely copied and used by anyone.
 * Developers are encouraged to incorporate parts in their
 * programs, rather than devise their own private sequence
 * format.
 *
 * This should compile and run with any ANSI C compiler.
 * Please advise me of any bugs, additions or corrections.

Readseq is particularly useful as it automatically detects many
sequence formats, and interconverts among them.

Formats which readseq currently understands:

  * IG/Stanford, used by Intelligenetics and others
  * GenBank/GB, genbank flatfile format
  * NBRF format
  * EMBL, EMBL flatfile format
  * GCG, single sequence format of GCG software
  * DNAStrider, for common Mac program
  * Fitch format, limited use
  * Pearson/Fasta, a common format used by Fasta programs and others
  * Zuker format, limited use. Input only.
  * Olsen, format printed by Olsen VMS sequence editor. Input only.
  * Phylip3.2, sequential format for Phylip programs
  * Phylip, interleaved format for Phylip programs (v3.3, v3.4)
  * Plain/Raw, sequence data only (no name, document, numbering)
  + MSF multi sequence format used by GCG software
  + PAUP's multiple sequence (NEXUS) format
  + PIR/CODATA format used by PIR
  + ASN.1 format used by NCBI
  + Pretty print with various options for nice looking output. Output only.

See the included "Formats" file for detail on file formats.


Example usage:
  readseq
      -- for interactive use

  readseq my.1st.seq  my.2nd.seq  -all  -format=genbank  -output=my.gb
      -- convert all of two input files to one genbank format output file

  readseq my.seq -all -form=pretty -nameleft=3 -numleft -numright -numtop -match
      -- output to standard output a file in a pretty format

  readseq my.seq -item=9,8,3,2 -degap -CASE -rev -f=msf -out=my.rev
      -- select 4 items from input, degap, reverse, and uppercase them

  cat *.seq | readseq -pipe -all -format=asn > bunch-of.asn
      -- pipe a bunch of data thru readseq, converting all to asn


The brief usage of readseq is as follows. The "[]" denote
optional parts of the syntax:

readseq -help
readSeq (27Dec92), multi-format molbio sequence reader.
usage: readseq [-options] in.seq > out.seq
 options
    -a[ll]         select All sequences
    -c[aselower]   change to lower case
    -C[ASEUPPER]   change to UPPER CASE
    -degap[=-]     remove gap symbols
    -i[tem=2,3,4]  select Item number(s) from several
    -l[ist]        List sequences only
    -o[utput=]out.seq  redirect Output
    -p[ipe]        Pipe (command line, <stdin, >stdout)
    -r[everse]     change to Reverse-complement
    -v[erbose]     Verbose progress
    -f[ormat=]#    Format number for output,  or
    -f[ormat=]Name Format name for output:
         1. IG/Stanford           10. Olsen (in-only)
         2. GenBank/GB            11. Phylip3.2
         3. NBRF                  12. Phylip
         4. EMBL                  13. Plain/Raw
         5. GCG                   14. PIR/CODATA
         6. DNAStrider            15. MSF
         7. Fitch                 16. ASN.1
         8. Pearson/Fasta         17. PAUP
         9. Zuker                 18. Pretty (out-only)

   Pretty format options:
    -wid[th]=#            sequence line width
    -tab=#                left indent
    -col[space]=#         column space within sequence line on output
    -gap[count]           count gap chars in sequence numbers
    -nameleft, -nameright[=#]   name on left/right side [=max width]
    -nametop              name at top/bottom
    -numleft, -numright   seq index on left/right side
    -numtop, -numbot      index on top/bottom
    -match[=.]            use match base for 2..n species
    -inter[line=#]        blank line(s) between sequence blocks


Notes:

In use, readseq will respond to command line arguments, or to
interactive use.  Command line arguments cannot be combined
but must each follow a switch character (-).  In this release,
the command line options are now words, with an equals (=)
to separate parameter(s) from the command.  You cannot put a
space between a command and its parameter, as is usual for
Unix programs (this is to preserve compatibility with VMS).
The command line syntax of the earlier versions is still
supported.

See the file Formats for details of the sequence formats which
are supported by readseq.  The auto-detection feature of
readseq which distinguishes these formats looks for some of the
unique keywords and symbols that are found in each format. It
is not infallible at this, though it attempts to exclude unknown
formats.  In general, if you feed to readseq a sequence file that
you know is one of these common formats, you are okay.  If you feed
it data that might be oddball formats, or non-sequence data,
you might well get garbage results.  Also, different developers
are always thinking up minor twists on these common formats
(like PAUP requiring a blank line between blocks of Phylip format,
or IG adding form feeds between sequences), which may cause hassles.

In general, output supports only minimal subsets of each format
needed for sequence data exchanges.  Features, descriptions
and other format-unique information are discarded.

The pretty format requires additional options to generate a
nice output.  Try the various pretty options to see what you like.
Pretty format is OUTPUT only; readseq cannot read a Pretty format
file.

Readseq is NOT optimized for LARGE files.  It generally makes several
reads thru each input file (one per sequence output at present, future
version may optimize this).  It should handle input and output files
and sequences of any size, but will slow down quite a bit for very large
(multi megabyte) sized files. It is NOT recommended for converting
databanks or large subsets there-of.  It is primarily directed at the
small files that researchers use to maintain their personal data, which
they frequently need to interconvert for the various analysis programs
which so frequently require a special format.

Users of Olsen multi sequence editor (VMS).  The Olsen format
here is produced with the print command:
  print/out=some.file
Use Genbank output from readseq to produce a format that this
editor can read, and use the command
  load/genbank some.file
Dan Davison has a VMS program that will convert to/from the
Olsen native binary data format.  E-mail davison@uh.edu

Warning: Phylip format input is now supported (30Dec92), however the
auto-detection of Phylip format is very probabilistic and messy,
especially distinguishing sequential from interleaved versions. It
is not recommended that one use readseq to convert files from Phylip
format to others unless essential.


This program is available thru Internet gopher, as

  gopher ftp.bio.indiana.edu
  browse into the IUBio-Software+Data/molbio/readseq/ folder
  select the readseq.shar document

Or thru anonymous FTP in this manner:
  my_computer> ftp  ftp.bio.indiana.edu  (or IP address 129.79.224.25)
    username:  anonymous
    password:  my_username@my_computer
  ftp> cd molbio/readseq
  ftp> get readseq.shar
  ftp> bye

readseq.shar is a Unix shell archive of the readseq files.
This file can be edited by any text editor to reconstitute the
original files, for those who do not have a Unix system or an
Unshar program.  Read the top of this .shar file for further
instructions.

There are also pre-compiled executables for the following computers:
Silicon Graphics Iris, Sparc (Sun Sparcstation & clones), VMS-Vax,
Macintosh. Use binary ftp to transfer these, except Macintosh.  The
Mac version is just the command-line program in a window, not very
handy.

C source files:
  readseq.c ureadseq.c ureadasn.c ureadseq.h

Document files:
  Readme (shorter than this doc)
  Readseq.help (this doc)
  Formats (description of sequence file formats)
  add.gdemenu (GDE program users can add this to the .GDEmenu file)
  Stdfiles -- test sequence files
  Makefile -- Unix make file
  Make.com -- VMS make file
  *.std    -- files for testing validity of readseq


Recent changes (see also readseq.c for all history of changes):

4 May 92
+ added 32 bit CRC checksum as alternative to GCG 6.5bit checksum
Aug 92
= fixed Olsen format input to handle files w/ more sequences,
  not to mess up when more than one seq has same identifier,
  and to convert number masks to symbols.
= IG format fix to understand ^L
30 Dec 92
* revised command-line & interactive interface.  Suggested form is now
    readseq infile -format=genbank -output=outfile -item=1,3,4 ...
  but remains compatible with prior commandlines:
    readseq infile -f2 -ooutfile -i3 ...
+ added GCG MSF multi sequence file format
+ added PIR/CODATA format
+ added NCBI ASN.1 sequence file format
+ added Pretty, multi sequence pretty output (only)
+ added PAUP multi seq format
+ added degap option
+ added Gary Williams (GWW, G.Williams@CRC.AC.UK) reverse-complement option.
+ added support for reading Phylip formats (interleave & sequential)
* string fixes, dropped need for compiler flags NOSTR, FIXTOUPPER, NEEDSTRCASECMP
* changed 32bit checksum to default, -DSMALLCHECKSUM for GCG version


SHAR_EOF
fi # end of overwriting check
echo shar: extracting "'Formats'" '(39136 characters)'
if test -f 'Formats'
then
	echo shar: will not over-write existing file "'Formats'"
else
cat << \SHAR_EOF > 'Formats'
||||||||||| ReadSeq supported formats   (revised 30Dec92)
--------------------------------------------------------

    -f[ormat=]Name Format name for output:
         1. IG/Stanford           10. Olsen (in-only)
         2. GenBank/GB            11. Phylip3.2
         3. NBRF                  12. Phylip
         4. EMBL                  13. Plain/Raw
         5. GCG                   14. PIR/CODATA
         6. DNAStrider            15. MSF
         7. Fitch                 16. ASN.1
         8. Pearson/Fasta         17. PAUP
         9. Zuker (in-only)       18. Pretty (out-only)

In general, output supports only minimal subsets of each format
needed for sequence data exchanges.  Features, descriptions
and other format-unique information are discarded.

Users of Olsen multi sequence editor (VMS).  The Olsen format
here is produced with the print command:
  print/out=some.file
Use Genbank output from readseq to produce a format that this
editor can read, and use the command
  load/genbank some.file
Dan Davison has a VMS program that will convert to/from the
Olsen native binary data format.  E-mail davison@uh.edu

Warning: Phylip format input is now supported (30Dec92), however the
auto-detection of Phylip format is very probabilistic and messy,
especially distinguishing sequential from interleaved versions. It
is not recommended that one use readseq to convert files from Phylip
format to others unless essential.



||||||||||| ReadSeq usage             (revised 11Nov91)
--------------------------------------------------------

A. determine file format:

        short skiplines;  /* result: number of header lines to skip (or 0) */
        short error;      /* error result or 0 */
        short format;     /* resulting format code, see ureadseq.h */
        char  *filename   = "Mysequence.file";

        format = seqFileFormat( filename, &skiplines, &error);
        if (error!=0) fail;

B. read number and list of sequences (optional)
        short numseqs;    /* resulting number of sequences found in file */
        char  *seqlist;   /* list of sequence names, newline separated, 0 terminated */

        seqlist = listSeqs( filename, skiplines, format, &numseqs, &error);
        if (error==0)  display (seqlist);
        free( seqlist);

C.  read individual sequences as desired
        short seqIndex;   /* sequence index #, or == kListSeqs for listSeqs equivalent */
        long  seqlen;     /* length of seq */
        char  seqid[256]; /* sequence name */
        char  *seq;       /* sequence, 0 terminated, free when done */

        seq = readSeq( seqIndex, filename, skiplines, format,
                      &seqlen, &numseqs, &error, seqid);
        if (error==0) manipulate(seq);
        free(seq);

D. write sequences as desired
        int nlines;     /* number of lines of sequence written */
        FILE* fout;     /* open file pointer (stdout or other) */
        short outform;  /* output format, see ureadseq.h */

        nlines = writeSeq( fout, seq, seqlen, format, outform, seqid);


Note (30Dec92): There is various processing done by the main program (in readseq.c),
  rather than just in the subroutines (in ureadseq.c).  Especially for interleaved
  output formats, the writeSeq subroutine does not handle interleaving, nor some of
  the formatting at the top and end of output files.  While seqFileFormat, listSeqs,
  and readSeq subroutines are fairly self-contained, the writeSeq depends a lot on
  auxiliary processing.  At some point, this may be revised so writeSeq is self-
  contained.

Note 2: The NCBI toolkit (ftp from ncbi.nlm.nih.gov) is needed for the ASN.1 format
  reading (see ureadasn.c).  A bastard (but workable I hope) ASN.1 format is written
  by writeSeq alone.



|||||||||||  sequence formats....
---------------------------------------------------

stanford/IG
;comments
;...
seq1 info
abcd...
efgh1 (or 2 = terminator)
;another seq
;....
seq2 info
abcd...1
--- for e.g. ----
;     Dro5s-T.Seq  Length: 120  April 6, 1989  21:22  Check: 9487  ..
dro5stseq
GCCAACGACCAUACCACGCUGAAUACAUCGGUUCUCGUCCGAUCACCGAAAUUAAGCAGCGUCGCGGGCG
GUUAGUACUUAGAUGGGGGACCGCUUGGGAACACCGCGUGUUGUUGGCCU1

;  TOIG of: Dro5srna.Seq  check: 9487  from: 1  to: 120
---------------------------------------------------

Genbank:
LOCUS    seq1 ID..
...
ORIGIN ...
123456789abcdefg....(1st 9 columns are formatting)
     hijkl...
//         (end of sequence)
LOCUS     seq2 ID ..
...
ORIGIN
      abcd...
//
---------------------------------------------------

NBRF format: (from uwgcg ToNBRF)
>DL;DRO5SRNA
Iubio$Dua0:[Gilbertd.Gcg]Dro5srna.Seq;2 => DRO5SRNA

      51  AAUUAAGCAG CGUCGCGGGC GGUUAGUACU UAGAUGGGGG ACCGCUUGGG
     101  AACACCGCGU GUUGUUGGCC U

---------------------------------------------------

EMBL format
ID345 seq1 id   (the 345 are spaces)
... other info
SQ345Sequence   (the 3,4,5 are spaces)
abcd...
hijk...
//              (! this is proper end string: 12Oct90)
ID    seq2 id
...
SQ   Sequence
abcd...
...
//
---------------------------------------------------

UW GCG Format:
comments of any form, up to ".." signal
signal line has seq id, and " Check: ####   .."
only 1 seq/file

-- e.g. --- (GCG from GenBank)
LOCUS       DROEST6      1819 bp ss-mRNA            INV       31-AUG-1987
    ... much more ...
ORIGIN      1 bp upstream of EcoRI site; chromosome BK9 region 69A1.

INVERTEBRATE:DROEST6  Length: 1819  January 9, 1989  16:48  Check: 8008  ..

       1  GAATTCGCCG GAGTGAGGAG CAACATGAAC TACGTGGGAC TGGGACTTAT

      51  CATTGTGCTG AGCTGCCTTT GGCTCGGTTC GAACGCGAGT GATACAGATG


---------------------------------------------------

DNAStrider (Mac) = modified Stanford:
; ### from DNA Strider  Friday, April 7, 1989   11:04:24 PM
; DNA sequence  pBR322   4363  b.p. complete sequence
;
abcd...
efgh
//  (end of sequence)
---------------------------------------------------

Fitch format:
Dro5srna.Seq
 GCC AAC GAC CAU ACC ACG CUG AAU ACA UCG GUU CUC GUC CGA UCA CCG AAA UUA AGC AGC
 GUC GCG GGC GGU UAG UAC UUA GAU GGG GGA CCG CUU GGG AAC ACC GCG UGU UGU UGG CCU
Droest6.Seq
 GAA TTC GCC GGA GTG AGG AGC AAC ATG AAC TAC GTG GGA CTG GGA CTT ATC ATT GTG CTG
 AGC TGC CTT TGG CTC GGT TCG AAC GCG AGT GAT ACA GAT GAC CCT CTG TTG GTG CAG CTG
---------------------------------------------------

W.Pearson/Fasta format:
>BOVPRL GenBank entry BOVPRL from omam file.  907 nucleotides.
TGCTTGGCTGAGGAGCCATAGGACGAGAGCTTCCTGGTGAAGTGTGTTTCTTGAAATCAT

---------------------------------------------------
Phylip version 3.2 format (e.g., DNAML):

   5   13 YF                (# seqs, #bases, YF)
Alpha     AACGTGGCCAAAT
          aaaagggccc...  (continued sp. alpha)
Beta      AAGGTCGCCAAAC
          aaaagggccc...  (continued sp. beta)
Gamma     CATTTCGTCACAA
          aaaagggccc...  (continued sp. Gamma)
1234567890^-- bases must start in col 11, and run 'til #bases 
        (spaces & newlines are okay)
---------------------------------------------------
Phylip version 3.3 format (e.g., DNAML):

  5    42  YF             (# seqs, #bases, YF)
Turkey    AAGCTNGGGC ATTTCAGGGT
Salmo gairAAGCCTTGGC AGTGCAGGGT
H. SapiensACCGGTTGGC CGTTCAGGGT
Chimp     AAACCCTTGC CGTTACGCTT
Gorilla   AAACCCTTGC CGGTACGCTT
1234567890^-- bases must start in col 11
  !! this version interleaves the species -- contrary to
     all other output formats.

GAGCCCGGGC AATACAGGGT AT
GAGCCGTGGC CGGGCACGGT AT
ACAGGTTGGC CGTTCAGGGT AA
AAACCGAGGC CGGGACACTC AT
AAACCATTGC CGGTACGCTT AA

---------------------------------------------------
Phylip version 3.4 format (e.g., DNAML)
-- Both Interleaved and sequential are permitted

   5   13                (# seqs, #bases)
Alpha     AACGTGGCCAAAT
          aaaagggccc...  (continued sp. alpha)
Beta      AAGGTCGCCAAAC
          aaaagggccc...  (continued sp. beta)
Gamma     CATTTCGTCACAA
          aaaagggccc...  (continued sp. Gamma)
1234567890^-- bases must start in col 11, and run 'til #bases 
        (spaces, newlines and numbers are ignored)

---------------------------------------------------
Gary Olsen (multiple) sequence editor /print format:

!---------------------
!17Oct91 -- error in original copy of olsen /print format, shifted right 1 space
! here is correct copy:
  301  40 Tb.thiop  CGCAGCGAAA----------GCUNUGCUAAUACCGCAUA-CGnCCUG-----------------------------------------------------  Tb.thiop
123456789012345678901
  301  42 Rhc.purp  CGUAGCGAAA----------GUUACGCUAAUACCGCAUA-UUCUGUG-----------------------------------------------------  Rhc.purp

  301  44 Rhc.gela  nnngnCGAAA----------GCCGGAUUAAUACCGCAUA-CGACCUA-----------------------------------------------------  Rhc.gela
!---------------------

 RNase P RNA components.  on 20-FEB-90 17:23:58

    1 (E.c. pr ):  Base pairing in Escherichia coli RNase P RNA.
    2 (chrom   ):  Chromatium
      :
   12 (B.brevis):  Bacillus brevis RNase P RNA, B. James.
   13 ( 90% con):   90% conserved
   14 (100% con):  100% conserved
   15 (gram+ pr):  pairing

1
 RNase P RNA components.  on 20-FEB-90 17:23:58

 Posi-   Sequence
 tion:   identity:   Data:

     1   1 E.c. pr      <<<<<<<<<< {{{{{{{{<<:<<<<<<<<<<^<<<<<<====>>>>  E.c. pr
     1   2 chrom        GGAGUCGGCCAGACAGUCGCUUCCGUCCU------------------  chrom
            :
     1  12 B.brevis  AUGCAGGAAAUGCGGGUAGCCGCUGCCGCAAUCGUCU-------------  B.brevis
1234567890123456789012 <! this should be 21 not 22,
! this example must be inset on left by 1 space from olsen /print files !
     1  13  90% con           G  C G  A  CGC GC               -    -      90% con
     1  14 100% con                G  A  CGC                             100% con
     1  15 gram+ pr     <<<<<<<<<< {{{{{{{{<<<<<<<<<<<<<===============  gram+ pr

    60   1 E.c. pr   >>>>>>^>>^>>>>:>>    <<<^<<<< {{{{{                 E.c. pr
    60   2 chrom     -----GGUG-ACGGGGGAGGAAAGUCCGG-GCUCCAU-------------  chrom
    :       :
    60  10 B.stearo  ----UU-CG-GCCGUAGAGGAAAGUCCAUGCUCGCACGGUGCUGAGAUGC  B.stearo


---------------------------------------------------
  GCG MSF format
Title line

picorna.msf  MSF: 100  Type: P  January 17, 1991  17:53  Check: 541
..
Name: Cb3              Len:   100  Check: 7009  Weight:  1.00
Name: E                Len:   100  Check:   60  Weight:  1.00

//

   1                                                   50
Cb3  ...gpvedai .......t.. aaigr..vad tvgtgptnse aipaltaaet
  E  gvenae.kgv tentna.tad fvaqpvylpe .nqt...... kv.affynrs

   51                                                 100

Cb3  ghtsqvvpgd tmqtrhvkny hsrsestien flcrsacvyf teykn.....
  E  ...spi.gaf tvks...... gs.lesgfap .fsngtc.pn sviltpgpqf

---------------------------------------------------
     PIR format
This is NBRF-PIR MAILSERVER version 1.45
Command-> get PIR3:A31391
\\\
ENTRY           A31391       #Type Protein
TITLE           *Esterase-6 - Fruit fly (Drosophila melanogaster)

DATE            03-Aug-1992 #Sequence 03-Aug-1992 #Text 03-Aug-1992
PLACEMENT          0.0    0.0    0.0    0.0    0.0
COMMENT         *This entry is not verified.
SOURCE          Drosophila melanogaster

REFERENCE
   #Authors     Cooke P.H., Oakeshott J.G.
   #Citation    submitted to GenBank, April 1989
   #Reference-number A31391
   #Accession   A31391
   #Cross-reference GB:J04167

SUMMARY       #Molecular-weight 61125  #Length 544  #Checksum  1679
SEQUENCE
                5        10        15        20        25        30
      1 M N Y V G L G L I I V L S C L W L G S N A S D T D D P L L V
     31 Q L P Q G K L R G R D N G S Y Y S Y E S I P Y A E P P T G D
     61 L R F E A P E P Y K Q K W S D I F D A T K T P V A C L Q W D
     91 Q F T P G A N K L V G E E D C L T V S V Y K P K N S K R N S
    121 F P V V A H I H G G A F M F G A A W Q N G H E N V M R E G K
    151 F I L V K I S Y R L G P L G F V S T G D R D L P G N Y G L K
    181 D Q R L A L K W I K Q N I A S F G G E P Q N V L L V G H S A
    211 G G A S V H L Q M L R E D F G Q L A R A A F S F S G N A L D
    241 P W V I Q K G A R G R A F E L G R N V G C E S A E D S T S L
    271 K K C L K S K P A S E L V T A V R K F L I F S Y V P F A P F
    301 S P V L E P S D A P D A I I T Q D P R D V I K S G K F G Q V
    331 P W A V S Y V T E D G G Y N A A L L L K E R K S G I V I D D
    361 L N E R W L E L A P Y L L F Y R D T K T K K D M D D Y S R K
    391 I K Q E Y I G N Q R F D I E S Y S E L Q R L F T D I L F K N
    421 S T Q E S L D L H R K Y G K S P A Y A Y V Y D N P A E K G I
    451 A Q V L A N R T D Y D F G T V H G D D Y F L I F E N F V R D
    481 V E M R P D E Q I I S R N F I N M L A D F A S S D N G S L K
    511 Y G E C D F K D N V G S E K F Q L L A I Y I D G C Q N R Q H
    541 V E F P
///
\\\
---------------------------------------------------
PAUP format:
The NEXUS Format

Every block starts with "BEGIN blockname;" and ends with "END;".
Each block is composed of one or more statements, each
terminated by a semicolon (;).

Comments may be included in NEXUS files by enclosing them within
square brackets, as in "[This is a comment]."

NEXUS-conforming files are identified by a "#NEXUS" directive at
the very beginning of the file (line 1, column 1).  If the
#NEXUS is omitted PAUP issues a warning but continues
processing.

NEXUS files are entirely free-format.  Blanks, tabs, and
newlines may be placed anywhere in the file.  Unless RESPECTCASE
is requested, commands and data may be entered in upper case,
lower case, or a mixture of upper and lower case.

The following conventions are used in the syntax descriptions of
the various blocks.  Upper-case items are entered exactly as
shown.  Lower-case items inside of angle brackets -- e.g., <x>
-- represent items to be substituted by the user.  Items inside
of square brackets -- e.g., [X] -- are optional.  Items inside
of curly braces and separated by vertical bars -- e.g.,  { X | Y
| Z } -- are mutually exclusive options.


The DATA Block

The DATA block contains the data matrix and other associated
information.  Its syntax is:

BEGIN DATA;
DIMENSIONS NTAX=<number of taxa> NCHAR=<number of characters>;
  [ FORMAT  [ MISSING=<missing-symbol> ]
        [ LABELPOS={ LEFT | RIGHT } ]
        [ SYMBOLS="<symbols-list>" ]
        [ INTERLEAVE ]
        [ MATCHCHAR=<match-symbol> ]
        [ EQUATE="<symbol>=<expansion> [<symbol>=<expansion>...]" ]
        [ TRANSPOSE ]
        [ RESPECTCASE ]
        [ DATATYPE = { STANDARD | DNA | RNA | PROTEIN } ]; ]
        [ OPTIONS [ IGNORE={ INVAR | UNINFORM } ]
        [ MSTAXA = { UNCERTAIN | POLYMORPH | VARIABLE } ]
        [ ZAP = "<list of zapped characters>" ] ; ]
  [ CHARLABELS <label_1> <label_2> ... <label_NCHAR> ; ]
  [ TAXLABELS <label1_1> <label1_2> <label1_NTAX> ; ]
  [ STATELABELS <currently ignored by PAUP> ; ]
  MATRIX <data-matrix> ;
  END;

--- example PAUP file

#NEXUS

[!Brown et al. (1982) primate mitochondrial DNA]

begin data;
  dimensions ntax=5 nchar=896;
  format datatype=dna matchchar=. interleave missing='-';
  matrix
[                              2                    4                    6            8                    ]
[         1                    1                    1                    1            1                    ]
human     aagcttcaccggcgcagtca ttctcataatcgcccacggR cttacatcctcattactatt ctgcctagcaaactcaaact acgaacgcactcacagtcgc
chimp     ................a.t. .c.................a ...............t.... ..................t. .t........c.........
gorilla   ..................tg ....t.....t........a ........a......t.... .................... .......a..c.....c...
orang     ................ac.. cc.....g..t.....t..a ..c........cc....g.. .................... .......a..c.....c...
gibbon    ......t..a..t...ac.g .c.................a ..a..c..t..cc.g..... ......t............. .......a........c...

[         8                    8                    8                    8            8              8     ]
[         0                    2                    4                    6            8              9     ]
[         1                    1                    1                    1            1              6     ]
human     cttccccacaacaatattca tgtgcctagaccaagaagtt attatctcgaactgacactg agccacaacccaaacaaccc agctctccctaagctt
chimp     t................... .a................c. ........a.....g..... ...a................ ................
gorilla   ..................tc .a................c. ........a.g......... ...a.............tt. .a..............
orang     ta....a...........t. .c.......ga......acc ..cg..a.a......tg... .a.a..c.....g...cta. .a.....a........
gibbon    a..t.......t........ ....ac...........acc .....t..a........... .a.tg..........gctag .a..............
  ;
end;
---------------------------------------------------






|||||||||||  Sample SMTP mail header
---------------------------------------------------

- - - - - - - - -
From GenBank-Retrieval-System@genbank.bio.net Sun Nov 10 17:28:56 1991
Received: from genbank.bio.net by sunflower.bio.indiana.edu
        (4.1/9.5jsm) id AA19328; Sun, 10 Nov 91 17:28:55 EST
Received: by genbank.bio.net (5.65/IG-2.0)
        id AA14458; Sun, 10 Nov 91 14:30:03 -0800
Date: Sun, 10 Nov 91 14:30:03 -0800
Message-Id: <9111102230.AA14458@genbank.bio.net>
From: Database Server <GenBank-Retrieval-System@genbank.bio.net>
To: gilbertd@sunflower.bio.indiana.edu
Subject: Results of Query for drorna
Status: R

No matches on drorna.
- - - - - -
From GenBank-Retrieval-System@genbank.bio.net Sun Nov 10 17:28:49 1991
Received: from genbank.bio.net by sunflower.bio.indiana.edu
        (4.1/9.5jsm) id AA19323; Sun, 10 Nov 91 17:28:47 EST
Received: by genbank.bio.net (5.65/IG-2.0)
        id AA14461; Sun, 10 Nov 91 14:30:03 -0800
Date: Sun, 10 Nov 91 14:30:03 -0800
Message-Id: <9111102230.AA14461@genbank.bio.net>
From: Database Server <GenBank-Retrieval-System@genbank.bio.net>
To: gilbertd@sunflower.bio.indiana.edu
Subject: Results of Query for droest6
Status: R

LOCUS       DROEST6      1819 bp ss-mRNA            INV       31-AUG-1987
DEFINITION  D.melanogaster esterase-6 mRNA, complete cds.
ACCESSION   M15961












|||||||||||  GCG manual discussion of sequence symbols:
---------------------------------------------------

III_SEQUENCE_SYMBOLS


     GCG programs allow all upper and lower  case  letters,  periods  (.),
asterisks  (*),  pluses  (+),  ampersands  (&),  and ats (@) as symbols in
biological sequences.  Nucleotide  symbols,  their  complements,  and  the
standard  one-letter amino acid symbols are shown below in separate lists.
The meanings of the symbols +, &, and @ have not  been  assigned  at  this
writing (March, 1989).

     GCG uses the  letter  codes  for  amino  acid  codes  and  nucleotide
ambiguity    proposed    by    IUB    (Nomenclature    Committee,    1985,
Eur. J. Biochem. 150; 1-5).  These codes are  compatible  with  the  codes
used by the EMBL, GenBank, and NBRF data libraries.


                               NUCLEOTIDES

     The meaning of each symbol, its complement,  and  the  Cambridge  and
Stanford  equivalents  are  shown below.  Cambridge files can be converted
into GCG files and vice versa with the programs FROMSTADEN  and  TOSTADEN.
IntelliGenetics  sequence  files  can  be interconverted with the programs
FROMIG and TOIG.

IUB/GCG      Meaning     Complement   Staden/Sanger  Stanford

   A             A             T             A            A
   C             C             G             C            C
   G             G             C             G            G
  T/U            T             A             T           T/U
   M           A or C          K             5            J
   R           A or G          Y             R            R
   W           A or T          W             7            L
   S           C or G          S             8            M
   Y           C or T          R             Y            Y
   K           G or T          M             6            K
   V        A or C or G        B       not supported      N
   H        A or C or T        D       not supported      N
   D        A or G or T        H       not supported      N
   B        C or G or T        V       not supported      N
  X/N     G or A or T or C     X            -/X           N
   .    not G or A or T or C   .       not supported      ?


  The frame ambiguity codes used by Staden are not  supported  by  GCG
and   are  translated  by  FROMSTADEN  as  the  lower  case  single  base
equivalent.

     Staden Code          Meaning              GCG

         D                C or CC                c
         V                T or TT                t
         B                A or AA                a
         H                G or GG                g
         K                C or CX                c
         L                T or TX                t
         M                A or AX                a
         N                G or GX                g


                        AMINO ACIDS

  Here is a list of the standard one-letter amino acid codes and their
three-letter  equivalents.   The synonymous codons and their depiction in
the IUB codes are shown.  You should recognize that the codons  following
semicolons  (;)  are  not  sufficiently specific to define a single amino
acid even though they represent the best possible back  translation  into
the IUB codes!  All of the relationships in this list can be redefined by
the user in a local data file described below.

                                                      IUB
Symbol 3-letter  Meaning      Codons                Depiction
 A    Ala       Alanine      GCT,GCC,GCA,GCG         !GCX
 B    Asp,Asn   Aspartic,
                Asparagine   GAT,GAC,AAT,AAC         !RAY
 C    Cys       Cysteine     TGT,TGC                 !TGY
 D    Asp       Aspartic     GAT,GAC                 !GAY
 E    Glu       Glutamic     GAA,GAG                 !GAR
 F    Phe     Phenylalanine  TTT,TTC                 !TTY
 G    Gly       Glycine      GGT,GGC,GGA,GGG         !GGX
 H    His       Histidine    CAT,CAC                 !CAY
 I    Ile       Isoleucine   ATT,ATC,ATA             !ATH
 K    Lys       Lysine       AAA,AAG                 !AAR
 L    Leu       Leucine      TTG,TTA,CTT,CTC,CTA,CTG !TTR,CTX,YTR;YTX
 M    Met       Methionine   ATG                     !ATG
 N    Asn       Asparagine   AAT,AAC                 !AAY
 P    Pro       Proline      CCT,CCC,CCA,CCG         !CCX
 Q    Gln       Glutamine    CAA,CAG                 !CAR
 R    Arg       Arginine     CGT,CGC,CGA,CGG,AGA,AGG !CGX,AGR,MGR;MGX
 S    Ser       Serine       TCT,TCC,TCA,TCG,AGT,AGC !TCX,AGY;WSX
 T    Thr       Threonine    ACT,ACC,ACA,ACG         !ACX
 V    Val       Valine       GTT,GTC,GTA,GTG         !GTX
 W    Trp       Tryptophan   TGG                     !TGG
 X    Xxx       Unknown                              !XXX
 Y    Tyr       Tyrosine     TAT, TAC                !TAY
 Z    Glu,Gln   Glutamic,
                Glutamine    GAA,GAG,CAA,CAG         !SAR
 *    End       Terminator   TAA, TAG, TGA           !TAR,TRA;TRR








|||||||||||  docs from PSC on sequence formats:
---------------------------------------------------


          Nucleic Acid and Protein Sequence File Formats


It will probably save you some time if you have your data in a usable
format before you send it to us.  However, we do have the University of
Wisconsin Genetics Computing Group programs running on our VAXen and
this package includes several reformatting utilities.  Our programs
usually recognize any of several standard formats, including GenBank,
EMBL, NBRF, and MolGen/Stanford.  For the purposes of annotating an
analysis we find the GenBank and EMBL formats most useful, particularly
if you have already received an accession number from one of these
organizations for your sequence.

Our programs do not require that all of the line types available in
GenBank, EMBL, or NBRF file formats be present for the file format to
be recognized and processed.  The following pages outline the essential
details required for correct processing of files by our programs.
Additional information may be present but will generally be ignored.


                      GenBank File Format

File Header

1.  The first line in the file must have "GENETIC SEQUENCE DATA BANK"
    in spaces 20 through 46 (see LINE  1, below).
2.  The next 8 lines may contain arbitrary text.  They are ignored but
    are required to maintain the GenBank format (see LINE 2 - LINE 9).

Sequence Data Entries

3.  Each sequence entry in the file should have the following format.
    a) first line:   Must have LOCUS in the first 5 spaces.  The
                     genetic locus name or identifier must be in spaces
                     13 - 22.  The length of the sequences is right
                     justified in spaces 23 through 29 (see LINE  10).
    b) second line:  Must have DEFINITION in the first 10 spaces.
                     Spaces 13 - 80 are free form text to identify the
                     sequence (see LINE  11).
    c) third line:   Must have ACCESSION in the first 9 spaces.  Spaces
                     13 - 18 must hold the primary accession number
                     (see LINE  12).
    d) fourth line:  Must have ORIGIN in the first 6 spaces.  Nothing
                     else is required on this line, it indicates that
                     the nucleic acid sequence begins on the next line
                     (see LINE  13).
    e) fifth line:   Begins the nucleotide sequence.  The first 9
                     spaces of each sequence line may either be blank
                     or may contain the position in the sequence of the
                     first nucleotide on the line.  The next 66 spaces
                     hold the nucleotide sequence in six blocks of ten
                     nucleotides.  Each of the six blocks begins with a
                     blank space followed by ten nucleotides.  Thus the
                     first nucleotide is in space eleven of the line while
                     the last is in space 75 (see LINE  14, LINE  15).
    f) last line:    Must have // in the first 2 spaces to indicate
                     termination of the sequence (see LINE  16).

NOTE:  Multiple sequences may appear in each file.  To begin another
       sequence go back to a) and start again.


                         Example GenBank file


LINE  1  :                   GENETIC SEQUENCE DATA BANK
LINE  2  :
LINE  3  :
LINE  4  :
LINE  5  :
LINE  6  :
LINE  7  :
LINE  8  :
LINE  9  :
LINE 10  :LOCUS       L_Name     Length BP
LINE 11  :DEFINITION  Describe the sequence any way you want
LINE 12  :ACCESSION   Accession Number
LINE 13  :ORIGIN
LINE 14  :        1 acgtacgtac gtacgtacgt acgtacgtac gtacgtacgt a...
LINE 15  :       61 acgt...
LINE 16  ://



                         EMBL File Format

Unlike the GenBank file format the EMBL file format does not require
a series of header lines.  Thus the first line in the file begins
the first sequence entry of the file.

1.  The first line of each sequence entry contains the two letters ID
    in the first two spaces.  This is followed by the EMBL identifier
    in spaces 6 through 14.  (See LINE  1).

2.  The second line of each sequence entry has the two letters AC in
    the first two spaces.  This is followed by the accession number in
    spaces 6 through 11.  (See LINE  2).

3.  The third line of each sequence entry has the two letters DE in the
    first two spaces.  This is followed by a free form text definition
    in spaces 6 through 72.  (See LINE  3).

4.  The fourth line in each sequence entry has the two letters SQ in
    the first two spaces.  This is followed by the length of the
    sequence beginning at or after space 13.  After the sequence length
    there is a blank space and the two letters BP.  (See LINE  4).

5.  The nucleotide sequence begins on the fifth line of the sequence
    entry.  Each line of sequence begins with four blank spaces. The
    next 66 spaces hold the nucleotide sequence in six blocks of ten
    nucleotides.  Each of the six blocks begins with a blank space
    followed by ten nucleotides.  Thus the first nucleotide is in space
    6 of the line while the last is in space 70.  (See LINE  5 -
    LINE  6).

6.  The last line of each sequence entry in the file is a terminator
    line which has the two characters // in the first two spaces.
    (See LINE  7).

7.  Multiple sequences may appear in each file.  To begin another
    sequence go back to item 1 and start again.


                          Example EMBL file

LINE  1  :ID   ID_name
LINE  2  :AC   Accession number
LINE  3  :DE   Describe the sequence any way you want
LINE  4  :SQ          Length BP
LINE  5  :     ACGTACGTAC GTACGTACGT ACGTACGTAC GTACGTA...
LINE  6  :     ACGT...
LINE  7  ://



            NBRF (protein or nucleic acid) File Format

1.  The first line of each sequence entry begins with a greater than
  symbol, >.  This is immediately followed by the two character
  sequence type specifier.  Space four must contain a semi-colon.
  Beginning in space five is the sequence name or identification code
  for the NBRF database.  The code is from four to six letters and
  numbers.  (See LINE  1).

!!!! >> add these to readseq
          Specifier             Sequence type

              P1                protein, complete
              F1                protein, fragment
              DL                DNA, linear
              DC                DNA, circular
              RL                RNA, linear
              RC                RNA, circular
              N1                functional RNA, other than tRNA
              N3                tRNA

2.  The second line of each sequence entry contains two kinds of
  information.  First is the sequence name which is separated from
  the organism or organelle name by the three character sequence
  blank space, dash, blank space, " - ".  There is no special
  character marking the beginning of this line.  (See LINE  2).

3.  Either the amino acid or nucleic acid sequence begins on line three
  and can begin in any space, including the first.  The sequence is
  free format and may be interrupted by blanks for ease of reading.
  Protein sequences may contain special punctuation to indicate
  various indeterminacies in the sequence.  In the NBRF data files
  all lines may be up to 500 characters long.  However some PSC
  programs currently have a limit of 130 characters per line
  (including blanks), and BitNet will not accept lines of over eighty
  characters.  (See LINE  3, LINE  4, and LINE  5).

  The last character in the sequence must be an asterisk, *.

                       Example NBRF file

 LINE  1  :>P1;CBRT
 LINE  2  :Cytochrome b - Rat mitochondrion (SGC1)
 LINE  3  :M T N I R K S H P L F K I I N H S F I D L P A P S
 LINE  4  : VTHICRDVN Y GWL IRY
 LINE  5  :TWIGGQPVEHPFIIIGQLASISYFSIILILMPISGIVEDKMLKWN*



                MolGen/Stanford File Format

1.  The first line in a sequence file is a comment line.  This line
  begins with a semi-colon in the first space.  This line need
  not be present.  If it is present it holds descriptive text.
  There may be as many comment lines as desired at the start of
  the sequence file.  (See LINE  1).

2.  The second line must be present and contains an identifier or
  name for the sequence in the first ten spaces.  (See LINE  2).

3.  The sequence begins on the third line and occupies up to eighty
  spaces.  Spaces may be included in the sequence for ease of
  reading.  The sequence continues for as many lines as needed
  and is terminated with a 1 or 2.  1 indicates a linear sequence
  while 2 marks a circular sequence.  (See LINE  3 and LINE  4).

                          Example MolGen/Stanford file

LINE  1  :;  Describe the sequence any way you want
LINE  2  :ECTRNAGLY2
LINE  3  :ACGCACGTAC ACGTACGTAC   A C G T C C G T ACG TAC GTA CGT
LINE  4  :  GCTTA   GG G C T A1




|||||||||||  Phylip file format
---------------------------------------------------

        Phylip 3.3 File Format (DNA sequences)


     The input and output formats for PROTPARS and for RESTML are described  in
their  document  files.   In  general  their input formats are similar to those
described here, except that the one-letter codes for data are specific to those
programs  and  are  described in those document files.  Since the input formats
for the eight DNA sequence programs apply to  all  eight,  they  are  described
here.   Their  input  formats are standard: the data have A's, G's, C's and T's
(or U's).  The first line of the input file contains the number of species  and
the  number  of  sites.   As  with  the other programs, options information may
follow this.  In the case of DNAML, DNAMLK,  and  DNADIST  an  additional  line
(described  in  the document file for these programs) may follow the first one.
Following this, each species starts on a new line.  The first 10 characters  of
that  line  are the species name.  There then follows the base sequence of that
species, each character being one of the letters A, B, C, D, G, H, K, M, N,  O,
R, S, T, U, V, W, X, Y, ?, or - (a period was also previously allowed but it is
no longer allowed, because it sometimes is used in  aligned  sequences  to mean
"the  same  as  the  sequence  above").   Blanks  will  be ignored, and so will
numerical digits.  This allows GENBANK and EMBL sequence  entries  to  be  read
with minimum editing.

     These characters can be  either  upper  or  lower  case.   The  algorithms
convert  all  input  characters  to upper case (which is how they are treated).
The characters constitute the IUPAC (IUB) nucleic acid code  plus  some  slight
extensions.  They enable input of nucleic acid sequences taking full account of
any ambiguities in the sequence.

The sequences can continue over multiple lines; when this is done the sequences
must  be  either  in  "interleaved"  format, similar to the output of alignment
programs, or "sequential" format.  These are described  in  the  main  document
file.   In sequential format all of one sequence is given, possibly on multiple
lines, before the next starts.  In interleaved format the  first  part  of  the
file  should  contain  the first part of each of the sequences, then possibly a
line containing nothing but a carriage-return character, then the  second  part
of  each  sequence, and so on.  Only the first parts of the sequences should be
preceded by names.  Here is a hypothetical example of interleaved format:

  5    42
Turkey    AAGCTNGGGC ATTTCAGGGT
Salmo gairAAGCCTTGGC AGTGCAGGGT
H. SapiensACCGGTTGGC CGTTCAGGGT
Chimp     AAACCCTTGC CGTTACGCTT
Gorilla   AAACCCTTGC CGGTACGCTT

GAGCCCGGGC AATACAGGGT AT
GAGCCGTGGC CGGGCACGGT AT
ACAGGTTGGC CGTTCAGGGT AA
AAACCGAGGC CGGGACACTC AT
AAACCATTGC CGGTACGCTT AA

while in sequential format the same sequences would be:

  5    42
Turkey    AAGCTNGGGC ATTTCAGGGT
GAGCCCGGGC AATACAGGGT AT
Salmo gairAAGCCTTGGC AGTGCAGGGT
GAGCCGTGGC CGGGCACGGT AT
H. SapiensACCGGTTGGC CGTTCAGGGT
ACAGGTTGGC CGTTCAGGGT AA
Chimp     AAACCCTTGC CGTTACGCTT
AAACCGAGGC CGGGACACTC AT
Gorilla   AAACCCTTGC CGGTACGCTT
AAACCATTGC CGGTACGCTT AA


Note, of course, that a portion of a sequence like this:

   300   AAGCGTGAAC GTTGTACTAA TRCAG

is perfectly legal, assuming that the species name  has  gone  before,  and  is
filled  out  to  full  length  by  blanks.  The above digits and blanks will be
ignored, the sequence being taken as starting at the first base symbol (in this
case an A).

     The present versions of the programs may sometimes have difficulties  with
the  blank  lines  between  groups of lines, and if so you might want to retype
those lines, making sure that they have only a  carriage-return  and  no  blank
characters on them, or you may perhaps have to eliminate them.  The symptoms of
this problem are that the programs complain that the sequences are not properly
aligned, and you can find no other cause for this complaint.

------------------------------------------------


|||||||||||  ASN.1 file format
---------------------------------------------------


ASN.1 -- see NCBI toolkit docs, source and examples (ncbi.nlm.nih.gov)

Example asn.1 sequence file----

Bioseq-set ::= {
seq-set {
  seq {
    id { local id 1 } ,                 -- id essential
    descr {  title "Dummy sequence data from nowhere"  } ,  -- optional
    inst {                              -- inst essential
      repr raw ,
      mol dna ,
      length 156 ,
      topology linear ,
      seq-data
        iupacna "GAATTCATTTTTGAAACAAATCGACCTGACGACGGAATGGTACTCGAATTA
TGGGCCAAAGGGTTTTATGGGACAAATTAATAGGTGTTCATTATATGCCACTTTCGGAGATTAGATACAGCAATGCAG
TGGATTCAAAGCAATAGAGTTGTTCTT" 
      } } ,

        seq {
          id { local id 2 } ,
          descr {  title "Dummy sequence 2 data from somewhere else"  } ,
          inst {
                repr raw ,
                mol dna ,
                length 150 ,
                topology linear ,
                seq-data
                  iupacna "TTTTTTTTTTTTGAAACAAATCGACCTGACGACGGAATGGTACTCGAATTA
TGGGCCAAAGGGTTTTATGGGACAAATTAATAGGTGTTCATTATATGCCACTTTCGGAGATTAGATACAGCAATGCAG
TGGATTCAAAGCAATAGAGTT" 
            }
          }
        }
      }


partial ASN.1 description from toolkit

Bioseq ::= SEQUENCE {
    id SET OF Seq-id ,            -- equivalent identifiers
    descr Seq-descr OPTIONAL , -- descriptors
    inst Seq-inst ,            -- the sequence data
    annot SET OF Seq-annot OPTIONAL }

Seq-inst ::= SEQUENCE {            -- the sequence data itself
    repr ENUMERATED {              -- representation class
        not-set (0) ,              -- empty
        virtual (1) ,              -- no seq data
        raw (2) ,                  -- continuous sequence
        seg (3) ,                  -- segmented sequence
        const (4) ,                -- constructed sequence
        ref (5) ,                  -- reference to another sequence
        consen (6) ,               -- consensus sequence or pattern
        map (7) ,                  -- ordered map (genetic, restriction)
        other (255) } ,
    mol ENUMERATED {               -- molecule class in living organism
        not-set (0) ,              --   > cdna = rna
        dna (1) ,
        rna (2) ,
        aa (3) ,
        na (4) ,                   -- just a nucleic acid
        other (255) } ,
    length INTEGER OPTIONAL ,      -- length of sequence in residues
    fuzz Int-fuzz OPTIONAL ,       -- length uncertainty
    topology ENUMERATED {          -- topology of molecule
        not-set (0) ,
        linear (1) ,
        circular (2) ,
        tandem (3) ,               -- some part of tandem repeat
        other (255) } DEFAULT linear ,
    strand ENUMERATED {            -- strandedness in living organism
        not-set (0) ,
        ss (1) ,                   -- single strand
        ds (2) ,                   -- double strand
        mixed (3) ,
        other (255) } OPTIONAL ,   -- default ds for DNA, ss for RNA, pept
    seq-data Seq-data OPTIONAL ,   -- the sequence
    ext Seq-ext OPTIONAL ,         -- extensions for special types
  hist Seq-hist OPTIONAL }       -- sequence history

------------------------------------------------
SHAR_EOF
fi # end of overwriting check
# Unpack the 'Stdfiles' test notes into the current directory.
echo shar: extracting "'Stdfiles'" '(3731 characters)'
# Never clobber a copy that is already present; only report and move on.
if test -f 'Stdfiles'
then
	echo shar: will not over-write existing file "'Stdfiles'"
else
# The delimiter is quoted (\SHAR_EOF), so the payload below is copied
# literally -- no parameter or command expansion -- up to the line that
# consists solely of SHAR_EOF.
cat << \SHAR_EOF > 'Stdfiles'
/* Stdfiles 
	generate standard files to test readseq
*/

C
#include <stdio.h>
/*
 * Writes a one-sequence test file to stdout: the title line
 * "> alphabet['!'..'~']" followed by every character code from '!'
 * through '~' on a single line.
 * No sequence formats use chars > #126, so those are ignored.
 *
 * Returns 0 so the invoking shell sees a defined, successful status
 * (the original relied on implicit int and fell off the end of main).
 */
int main(void)
{
	int c;

	puts("> alphabet['!'..'~']");
	for (c = '!'; c <= '~'; c++)
		putc(c, stdout);
	putc('\n', stdout);
	return 0;
}

link  -w  -t MPST -c 'MPS ' c.o  ∂
		"{Libraries}"Interface.o  "{Libraries}"ToolLibs.o ∂
		"{Libraries}"Runtime.o  "{CLibraries}"StdClib.o 
link.out > alphabet.orig


C
#include <stdio.h>
#include <string.h>	/* strcpy, strcat: previously used without a declaration */
/*
 * Writes the "nucleic/amino" test sequence to stdout: a title line, then
 * one symbol line (amino-acid codes, primary nucleotide codes, then
 * punctuation symbols) repeated four times.
 *
 * Returns 0 so the invoking shell sees a defined, successful status.
 */
int main(void)
{
	/* note: symbols "*" and "/" removed as terminators for various formats */
	const char *aminos     = "ABCDEFGHIKLMNPQRSTVWXYZ";
	const char *primenuc   = "ACGTU";
	const char *allsymbols = "_.-?<>{}[]()!@#$%^&=+;:'|`~\"\\";

	char *c, all[256];
	int count;

	/* Build the single symbol line once; the three pieces total far
	   fewer than 256 characters, so the fixed buffer cannot overflow. */
	strcpy(all, aminos);
	strcat(all, primenuc);
	strcat(all, allsymbols);

	puts("> nucleic/amino test");
	for (count = 0; count < 4; count++) {
		for (c = all; *c != 0; c++)
			putc(*c, stdout);
		putc('\n', stdout);
	}
	return 0;
}

link  -w  -t MPST -c 'MPS ' c.o  ∂
		"{Libraries}"Interface.o  "{Libraries}"ToolLibs.o ∂
		"{Libraries}"Runtime.o  "{CLibraries}"StdClib.o 
link.out > nucleic.std

#--------------------------

#standards (ship w/ readseq)
#note: not all alphabet.orig chars are expected to be passed by
#     readseq.  Numbers are dropped.
readseq -p alphabet.orig > alphabet.std
readseq -p -C  alphabet.std > upper.std

cat alphabet.orig
	> alphabet['!'..'~']
	!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~

cat alphabet.std
	>alphabet['!'..'~'], 83 bases, 9429 checksum.
	!"#$%&'()*+-./:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]
	^_`abcdefghijklmnopqrstuvwxyz{|}~

cat upper.std
	>alphabet['!'..'~'], 83 bases, 9429 checksum.
	!"#$%&'()*+-./:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]
	^_`ABCDEFGHIJKLMNOPQRSTUVWXYZ{|}~

cat nucleic.std
	> nucleic/amino test
	ABCDEFGHIKLMNPQRSTVWXYZACGTU_.-?<>{}[]()!@#$%^&=+;:'|`~"\
	ABCDEFGHIKLMNPQRSTVWXYZACGTU_.-?<>{}[]()!@#$%^&=+;:'|`~"\
	ABCDEFGHIKLMNPQRSTVWXYZACGTU_.-?<>{}[]()!@#$%^&=+;:'|`~"\
	ABCDEFGHIKLMNPQRSTVWXYZACGTU_.-?<>{}[]()!@#$%^&=+;:'|`~"\

readseq -p nucleic.std
	>nucleic/amino test, 228 bases, 5952 checksum.
	ABCDEFGHIKLMNPQRSTVWXYZACGTU_.-?<>{}[]()!@#$%^&=+;
	:'|`~"\ABCDEFGHIKLMNPQRSTVWXYZACGTU_.-?<>{}[]()!@#
	$%^&=+;:'|`~"\ABCDEFGHIKLMNPQRSTVWXYZACGTU_.-?<>{}
	[]()!@#$%^&=+;:'|`~"\ABCDEFGHIKLMNPQRSTVWXYZACGTU_
	.-?<>{}[]()!@#$%^&=+;:'|`~"\


#----------------------------------

#test for general read/write of all chars:
readseq -p alphabet.std -otest.alpha
diff test.alpha alphabet.std

#test for valid toupper, general read/write:
readseq -p -C  alphabet.std -otest.upper
diff test.upper upper.std
#for vms, use "-C" to preserve case
# readseq -p "-C"  alphabet.std -otest.upper

#test for multiple sequence file conversions
# leave out gcg, raw; 
# test of long seq conversion ?
# test of mail-header seq conversion ?

#test for valid format conversions
readseq -v -p -f1 nucleic.std -otest.f1
readseq -v -p -f2 test.f1 -otest.f2
readseq -v -p -f3 test.f2 -otest.f3
readseq -v -p -f4 test.f3 -otest.f4
readseq -v -p -f5 test.f4 -otest.f5
readseq -v -p -f6 test.f5 -otest.f6
readseq -v -p -f7 test.f6 -otest.f7
readseq -v -p -f8 test.f7 -otest.f8
readseq -v -p -f1 test.f8 -otest.f1b   
diff test.f1 test.f1b
compare test.f1 test.f1b

readseq -v -p -f13 test.f8 -otest.f13   # raw, drops name
readseq -v -p -f9 test.f8 -otest.f9   	# zuker, little used
#readseq -v -p -f10 test.f9 -otest.f10  # olsen, input only (output=raw)
readseq -v -p -f11 test.f8 -otest.f11	# phylip 3.2, output only
readseq -v -p -f12 test.f8 -otest.f12	# phylip 3.3, output only
readseq -v -p -f14 test.f8 -otest.f14	# phylip 3.4, output only


#clean up
rm test.≈


#-----------------------------
# some general tests

readseq -h 

readseq
SHAR_EOF
fi # end of overwriting check
# Extract 'Makefile' unless a file of that name already exists (an existing
# file is never overwritten).  After writing, compare the byte count with the
# size announced by the archive so that a truncated or newline-mangled
# transfer is reported instead of passing silently.
# Backticks are used for the command substitution to stay within the plain
# /bin/sh syntax used by the rest of this archive.
echo shar: extracting "'Makefile'" '(3236 characters)'
if test -f 'Makefile'
then
	echo shar: will not over-write existing file "'Makefile'"
else
cat << \SHAR_EOF > 'Makefile'
#
# Unix Makefile for readseq
# to use, command me:
#  %  make       -- or --
#  %  make CC=your-c-compiler-name
#

# pick an ANSI C compiler (the default Sun CC is not ANSI)
CC=gcc  # Gnu C Compiler
#CC=cc  # SGI Irix
#CC=vcc # some DEC Ultrix

CFLAGS=
#CFLAGS= -DSMALLCHECKSUM  # if you prefer to use a GCG-standard 13 bit checksum
#    instead of a full 32 bit checksum. This may enhance compatibility w/ GCG software

SOURCES= readseq.c ureadseq.c ureadseq.h ureadasn.c
DOCS= Readme Readseq.help Formats Stdfiles Makefile Make.com add.gdemenu *.std


# NCBI toolkit support for ASN.1 reader

# this is path to NCBI toolkit, you must set for your system:
NCBI=
#NCBI=/bio/mb/ncbi
#
OTHERLIBS=-lm
LIB1=-lncbi
LIB2=-lncbiobj
LIB3=-lncbicdr
LIB4=-lvibrant
INCPATH=$(NCBI)/include
LIBPATH=$(NCBI)/lib
NCFLAGS=$(CFLAGS) -DNCBI -I$(INCPATH)
NLDFLAGS=-I$(INCPATH) -L$(LIBPATH)
NLIBS=$(LIB1) $(LIB2) $(OTHERLIBS)


all: build test

build: $(SOURCES)
	@echo "Compiling readseq..."
	$(CC) $(CFLAGS) -o readseq readseq.c ureadseq.c

# if using NCBI, uncomment these lines in place of build: above
#build: $(SOURCES)
# @echo "Compiling readseq with NCBI toolkit support...";
# $(CC) -o readseq $(NLDFLAGS) $(NCFLAGS) readseq.c ureadseq.c ureadasn.c $(NLIBS)

test: $(SOURCES) readseq
	@echo ""
	@echo "Test for general read/write of all chars:"
	./readseq -p alphabet.std -otest.alpha
	-diff test.alpha alphabet.std

	@echo ""
	@echo "Test for valid format conversions:"
	./readseq -v -p -f=ig   nucleic.std -otest.ig
	./readseq -v -p -f=gb   test.ig     -otest.gb
	./readseq -v -p -f=nbrf test.gb     -otest.nbrf
	./readseq -v -p -f=embl test.nbrf   -otest.embl
	./readseq -v -p -f=gcg  test.embl   -otest.gcg
	./readseq -v -p -f=strider test.gcg -otest.strider
	./readseq -v -p -f=fitch test.strider -otest.fitch
	./readseq -v -p -f=fasta test.fitch -otest.fasta
	./readseq -v -p -f=pir  test.fasta  -otest.pir
	./readseq -v -p -f=ig   test.pir    -otest.ig-b
	-diff test.ig test.ig-b

	@echo ""
	@echo "Test for multiple-sequence format conversions:"
	./readseq -p -f=ig    multi.std   -otest.m-ig
	./readseq -p -f=gb    test.m-ig   -otest.m-gb
	./readseq -p -f=nbrf  test.m-gb   -otest.m-nbrf
	./readseq -p -f=embl  test.m-nbrf -otest.m-embl
	./readseq -p -f=fasta test.m-embl -otest.m-fasta
	./readseq -p -f=pir   test.m-fasta -otest.m-pir
	./readseq -p -f=msf   test.m-pir  -otest.m-msf
	./readseq -p -f=paup  test.m-msf  -otest.m-paup
	./readseq -p -f=ig    test.m-paup -otest.m-ig-b
	-diff test.m-ig test.m-ig-b
#
# if using NCBI, uncomment these lines
# @echo ""
# @echo "Test of NCBI ASN.1 conversions:"
# ./readseq -p -f=asn test.m-ig  -otest.m-asn
# ./readseq -p -f=ig  test.m-asn -otest.m-ig-c
# -diff test.m-ig test.m-ig-c
#
	@echo ""
	@echo "Expect differences in the header lines due to"
	@echo "different format headers.  If any sequence lines"
	@echo "differ, or if the checksums differ, there is a problem."
	@echo "----------------------"
	@echo ""
	@echo "To clean up test files, command me:"
	@echo "    make clean"


clean:
	rm -f *.o core test.*

shar:
	@echo "shell archiving files..."
	-rm -f readseq*.shar
	mkdir readseqd
	cp $(SOURCES) readseqd
	cp $(DOCS) readseqd
	shar -v readseqd > readseq.shar
	rm -rf readseqd

SHAR_EOF
# Verify that the extracted file has the announced size.
if test 3236 -ne "`wc -c < 'Makefile'`"
then
	echo shar: error transmitting "'Makefile'" '(should have been 3236 characters)'
fi
fi # end of overwriting check
# Extract 'Make.com' (VAX-VMS DCL build/test procedure) unless a file of that
# name already exists (an existing file is never overwritten).
# The embedded line  $ echo "To clean up test files, command me:"  now carries
# its closing quotation mark, which was missing; this adds one character, so
# the announced size is 1920 rather than 1919.
# After writing, the byte count is compared with the announced size so that a
# truncated or newline-mangled transfer is reported instead of passing
# silently.  Backticks keep the check within the plain /bin/sh syntax used by
# the rest of this archive.
echo shar: extracting "'Make.com'" '(1920 characters)'
if test -f 'Make.com'
then
	echo shar: will not over-write existing file "'Make.com'"
else
cat << \SHAR_EOF > 'Make.com'
$!
$!VAX-VMS cc make file for readseq
$!
$ echo := write sys$output
$ if p1.eqs."TEST" then goto tests
$
$ echo "compiling readseq..."
$ cc readseq, ureadseq
$!
$ echo "linking readseq..."
$ link readseq, ureadseq, sys$library:vaxcrtl/lib
$!
$tests:
$!
$ echo "defining readseq symbol:"
$ dd = f$environment("default")
$ readseq :== $ 'dd'readseq.exe
$ show symbol readseq
$!
$ echo ""
$ echo "test for general read/write of all chars:"
$ readseq -p alphabet.std -otest.alpha
$ diff test.alpha alphabet.std
$!
$ echo ""
$ echo "test for valid format conversions"
$!
$ readseq -v -p -f=ig   nucleic.std -otest.ig
$ readseq -v -p -f=gb   test.ig     -otest.gb
$ readseq -v -p -f=nbrf test.gb     -otest.nbrf
$ readseq -v -p -f=embl test.nbrf   -otest.embl
$ readseq -v -p -f=gcg  test.embl   -otest.gcg
$ readseq -v -p -f=strider test.gcg -otest.strider
$ readseq -v -p -f=fitch test.strider -otest.fitch
$ readseq -v -p -f=fasta test.fitch -otest.fasta
$ readseq -v -p -f=pir  test.fasta  -otest.pir
$ readseq -v -p -f=ig   test.pir    -otest.ig-b
$ diff test.ig test.ig-b
$!
$ echo ""
$ echo "Test for multiple-sequence format conversions:"
$ readseq -p -f=ig    multi.std   -otest.m-ig
$ readseq -p -f=gb    test.m-ig   -otest.m-gb
$ readseq -p -f=nbrf  test.m-gb   -otest.m-nbrf
$ readseq -p -f=embl  test.m-nbrf -otest.m-embl
$ readseq -p -f=fasta test.m-embl -otest.m-fasta
$ readseq -p -f=pir   test.m-fasta -otest.m-pir
$ readseq -p -f=msf   test.m-pir  -otest.m-msf
$ readseq -p -f=paup  test.m-msf  -otest.m-paup
$ readseq -p -f=ig    test.m-paup -otest.m-ig-b
$ diff test.m-ig test.m-ig-b
$ echo ""
$ echo "Expect differences in the header lines due to"
$ echo "different format headers.  If any sequence lines"
$ echo "differ, or if checksums differ, there is a problem."
$!
$! #cleanup
$! delete test.*;
$ echo "-----------"
$ echo ""
$ echo "To clean up test files, command me:"
$ echo "  DELETE test.*;"
$!
SHAR_EOF
# Verify that the extracted file has the announced size.
if test 1920 -ne "`wc -c < 'Make.com'`"
then
	echo shar: error transmitting "'Make.com'" '(should have been 1920 characters)'
fi
fi # end of overwriting check
# Extract 'add.gdemenu' (GDE menu entries that call readseq) unless a file of
# that name already exists (an existing file is never overwritten).  After
# writing, compare the byte count with the size announced by the archive so
# that a truncated or newline-mangled transfer is reported instead of passing
# silently.  Backticks keep the check within the plain /bin/sh syntax used by
# the rest of this archive.
echo shar: extracting "'add.gdemenu'" '(2100 characters)'
if test -f 'add.gdemenu'
then
	echo shar: will not over-write existing file "'add.gdemenu'"
else
cat << \SHAR_EOF > 'add.gdemenu'
#
# dgg added new readseq formats, 29 dec 92
#

item:Export Foreign Format
itemmethod:readseq in1 -pipe -all -form=$FORMAT > $OUTPUTFILE
itemhelp:readseq.help

arg:FORMAT
argtype:choice_menu
argchoice:GenBank:genbank
argchoice:IG/Stanford:ig
argchoice:NBRF:nbrf
argchoice:EMBL:embl
argchoice:GCG:gcg
argchoice:DNA Strider:strider
argchoice:Fitch:fitch
argchoice:Pearson/Fasta:pearson
argchoice:Zuker:zuker
argchoice:Olsen:olsen
argchoice:Phylip:phylip
#argchoice:Phylip v3.2:phylip3.2
argchoice:Plain text:raw
argchoice:ASN.1:asn
argchoice:PIR:pir
argchoice:MSF:msf
argchoice:PAUP:paup
argchoice:Pretty:pretty -nametop -nameleft=3 -numright -nameright -numtop

arg:OUTPUTFILE
argtype:text
arglabel:Save as?

in:in1
informat:genbank


#
#dgg addition for new readseq, 24 dec 92
#

item:Pretty Print
itemmethod:readseq in1 -p -a -f=pretty $NAMELEFT $NAMERIGHT $NUMTOP $NUMBOT $NUMLEFT $NUMRIGHT -col=$COLS -width=$WIDTH $MATCH $GAPC > in1.pretty; (textedit in1.pretty; /bin/rm -f in1 in1.pretty)&
itemhelp:readseq.help

#nametop is bad !?

in:in1
informat:genbank

arg:NAMETOP
argtype:chooser
arglabel:Names at top  ?
argchoice:No:
argchoice:Yes:-nametop

arg:NAMELEFT
argtype:chooser
arglabel:Names at left ?
argchoice:No:
argchoice:Yes:-nameleft

arg:NAMERIGHT
argtype:chooser
arglabel:Names at right?
argchoice:Yes:-nameright
argchoice:No:

arg:NUMTOP
argtype:chooser
arglabel:Numbers at top  ?
argchoice:Yes:-numtop
argchoice:No:

arg:NUMBOT
argtype:chooser
arglabel:Numbers at tail ?
argchoice:No:
argchoice:Yes:-numbot

arg:NUMLEFT
argtype:chooser
arglabel:Numbers at left ?
argchoice:Yes:-numleft
argchoice:No:

arg:NUMRIGHT
argtype:chooser
arglabel:Numbers at right?
argchoice:Yes:-numright
argchoice:No:

arg:MATCH
argtype:chooser
arglabel:Use match '.' for 2..n species?
argchoice:No:
argchoice:Yes:-match

arg:GAPC
argtype:chooser
arglabel:Count gap symbols?
argchoice:No:
argchoice:Yes:-gap

arg:WIDTH
argtype:slider
arglabel:Sequence width?
argmin:10
argmax:200
argvalue:50

arg:COLS
argtype:slider
arglabel:Column spacers?
argmin:0
argmax:50
argvalue:10


### pretty print insert end
#


SHAR_EOF
# Verify that the extracted file has the announced size.
if test 2100 -ne "`wc -c < 'add.gdemenu'`"
then
	echo shar: error transmitting "'add.gdemenu'" '(should have been 2100 characters)'
fi
fi # end of overwriting check
# Extract 'alphabet.std' unless a file of that name already exists (an
# existing file is never overwritten).  After writing, compare the byte count
# with the size announced by the archive so that a truncated or
# newline-mangled transfer is reported instead of passing silently.
# Backticks keep the check within the plain /bin/sh syntax used by the rest
# of this archive.
echo shar: extracting "'alphabet.std'" '(131 characters)'
if test -f 'alphabet.std'
then
	echo shar: will not over-write existing file "'alphabet.std'"
else
cat << \SHAR_EOF > 'alphabet.std'
>alphabet['!'..'~'], 83 bases, 9429 checksum.
!"#$%&'()*+-./:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]
^_`abcdefghijklmnopqrstuvwxyz{|}~
SHAR_EOF
# Verify that the extracted file has the announced size.
if test 131 -ne "`wc -c < 'alphabet.std'`"
then
	echo shar: error transmitting "'alphabet.std'" '(should have been 131 characters)'
fi
fi # end of overwriting check
# Extract 'multi.std' (five-sequence test file) unless a file of that name
# already exists (an existing file is never overwritten).  After writing,
# compare the byte count with the size announced by the archive so that a
# truncated or newline-mangled transfer is reported instead of passing
# silently.  Backticks keep the check within the plain /bin/sh syntax used by
# the rest of this archive.
echo shar: extracting "'multi.std'" '(1180 characters)'
if test -f 'multi.std'
then
	echo shar: will not over-write existing file "'multi.std'"
else
cat << \SHAR_EOF > 'multi.std'
>acarr58sst      , 183 bases, 6EF8F222 checksum.
a-------actcctaacaacGgAtatCTtGgtT-CtcgcgagGatGAaGa
acGcAGcg--AaatGcGatacgtagtgtgaatcgc-agggatcagtgaat
catcgaatctttgaacgcaagttgcgctctcgtg--gtttaaccccccgg
gagc-acgttcgcttgagtgcc--gctt-----
>amgrrbst        , 183 bases, 250C5F09 checksum.
----cgt--ccccgaacggcGgAtcaCTtGgcT-CgtggatcGatGAaGa
ccGcAGc--tAtctGcGcgtcgtcgtgtaatccgc-aggttatac--gaa
catcgaccagtcgaacgcacattgcggcctcggt--gacccgcgggcccc
gggccacgcctgtctgagggtc--gt------a
>bmorrbst        , 183 bases, AF379101 checksum.
aaatgattaccctggacggtGgAtcaCTtGgcT-CgcgggtcGatGAaGa
acGcAGt--tAactGcGcgtcatagtgtgaactgc-aggacacatttgaa
catcgacatttcgaacgcacattgcggtc-cgtg--gagacaca--tcca
ggaccactcctgtctgagggcc--ggct-----
>crerrbst        , 183 bases, AD1ECB91 checksum.
a-------actctcaacaacGgAtatCTtGgcT-CtcggatcGatGAaGg
acGcAGcg--AaatGcGatacgtagtgtgaactgc-agaaatacgtgaac
tatcgaatccctgaacgtatactgcgccc--gag--gcc---ccggt--a
gagc-atgtctgccttagtgct--gggt----t
>ddirr58sst      , 183 bases, 783AEF3F checksum.
t-----taagcataaacggtGaAtacCTcGacTcC-caaattGatGAaGa
ccGtAGca--AactGcGataattcacttgaattgc-agcctactg-ggat
agttgaaatgttgaacgcacatgatgacatcggt---cctttcggattag
gtgttatacttgggtgagagt--------ggtc
SHAR_EOF
# Verify that the extracted file has the announced size.
if test 1180 -ne "`wc -c < 'multi.std'`"
then
	echo shar: error transmitting "'multi.std'" '(should have been 1180 characters)'
fi
fi # end of overwriting check
# Extract 'nucleic.std' unless a file of that name already exists (an
# existing file is never overwritten).  After writing, compare the byte count
# with the size announced by the archive so that a truncated or
# newline-mangled transfer is reported instead of passing silently.
# Backticks keep the check within the plain /bin/sh syntax used by the rest
# of this archive.
echo shar: extracting "'nucleic.std'" '(253 characters)'
if test -f 'nucleic.std'
then
	echo shar: will not over-write existing file "'nucleic.std'"
else
cat << \SHAR_EOF > 'nucleic.std'
> nucleic/amino test
ABCDEFGHIKLMNPQRSTVWXYZACGTU_.-?<>{}[]()!@#$%^&=+;:'|`~"\
ABCDEFGHIKLMNPQRSTVWXYZACGTU_.-?<>{}[]()!@#$%^&=+;:'|`~"\
ABCDEFGHIKLMNPQRSTVWXYZACGTU_.-?<>{}[]()!@#$%^&=+;:'|`~"\
ABCDEFGHIKLMNPQRSTVWXYZACGTU_.-?<>{}[]()!@#$%^&=+;:'|`~"\
SHAR_EOF
# Verify that the extracted file has the announced size.
if test 253 -ne "`wc -c < 'nucleic.std'`"
then
	echo shar: error transmitting "'nucleic.std'" '(should have been 253 characters)'
fi
fi # end of overwriting check
# Extract 'upper.std' unless a file of that name already exists (an existing
# file is never overwritten).  After writing, compare the byte count with the
# size announced by the archive so that a truncated or newline-mangled
# transfer is reported instead of passing silently.
# Backticks keep the check within the plain /bin/sh syntax used by the rest
# of this archive.
echo shar: extracting "'upper.std'" '(131 characters)'
if test -f 'upper.std'
then
	echo shar: will not over-write existing file "'upper.std'"
else
cat << \SHAR_EOF > 'upper.std'
>alphabet['!'..'~'], 83 bases, 9429 checksum.
!"#$%&'()*+-./:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]
^_`ABCDEFGHIJKLMNOPQRSTUVWXYZ{|}~
SHAR_EOF
# Verify that the extracted file has the announced size.
if test 131 -ne "`wc -c < 'upper.std'`"
then
	echo shar: error transmitting "'upper.std'" '(should have been 131 characters)'
fi
fi # end of overwriting check
# Archive epilogue: announce that the 'readseqd' directory is finished, step
# back up to the parent directory, and end the script with a success status.
echo "shar: done with directory 'readseqd'"
cd ..
#	End of shell archive
exit 0
