From: SMTP%"clark@mshri.utoronto.ca" 12-MAR-1990 17:36 To: GILBERTD Subj: FAMAIL shell update Date: Mon, 12 Mar 90 17:28:19 EST Message-Id: <9003122228.AA14079@lash.utcs.utoronto.ca> From: clark@mshri.utoronto.ca To: @mail-shells.dis@lash.utcs.utoronto.ca Subject: FAMAIL shell update Fellow GCGer, Here is the modified version of the DCL shell FAMAIL. It has been updated to allow searching of the new GenBank database GenPept, which is a translation of the protein-coding regions of the regular GenBank sequences. I have also changed the shell to make it impossible to search a protein sequence against a DNA database, and vice versa. One of our post-docs tried searching a peptide against GenBank and got back at least 200 pages of junk. I hope this modification will cut down the net traffic a bit. As always, let me know if you discover any bugs. Stephen Clark, Ph.D. Computer Resources Manager Samuel Lunenfeld Research Institute Mount Sinai Hospital 600 University Avenue Toronto, Ontario Canada M5G 1X5 clark@mshri.utoronto.ca (Internet) sinai@utoroci (Netnorth/Bitnet) ************** $ ! FAMAIL.COM $ $ ! March 12, 1990 $ $ ! Written by Steve Clark $ ! Mt. Sinai Hospital Research Institute, Toronto, Canada $ $ ! Command procedure to send a sequence to GenBank to have a FASTA $ ! search performed on it. THis procedure asks all the relevant $ ! questions, constructs a text file with the sequence in Intelligenetics $ ! format, and mails it to GenBank. It accepts the name of the query $ ! sequence on the command line as P1. $ ! Note: the symbol SEARCH_ADDRESS is the network address for the Genbank $ ! Search service. This will have to be changed to accomodate local $ ! gateways, etc. $ $ on control_y then goto terminate $ bell[0,7] = 7 $ ws := "write sys$output" $ iq := inquire/nopunctuation $ $ ! The Internet address for sending the search file is $ ! SEARCH@GENBANK.BIO.NET $ $ search_address := "lash::genbank.bio.net::search" $ $ ws "" $ ws "This procedure initiates a FASTA search for similarity between" $ ws "your query sequence and one of the databases maintained by GenBank." $ ws "The information required for executing the search is sent to" $ ws "GenBank via electronic mail and is executed by the GenBank people" $ ws "themselves. Their databases are much more current than our local" $ ws "ones, and their computer is very fast. You must specify which" $ ws "strand to search, since GenBank only searches ONE of the strands" $ ws "of your sequence. The results of the search will be returned to" $ ws "you via e-mail." $ ws "" $ ws "Please remember that if you submit a second search against ALL of" $ ws "GenBank or EMBL while a previous search is still waiting to be" $ ws "executed, the previous search will be aborted." $ $ seqname := "''p1'" $ if(seqname.NES."") then goto no_query $ $get_query: $ $ ws "" $ iq seqname "GenBank FASTA with what query sequence? " $ if(seqname.EQS."") then goto get_query $ $no_query: $ $ ! See if the sequence exists $ $ assign/user_mode nl: sys$output $ seqinfo/infile='seqname' $ if(seqinfotype.NES."NONE") then goto check_gcg $ ws "" $ ws "''bell'''seqname' doesn't exist. Please try again." $ goto get_query $ $check_gcg: $ $ ! Check if the sequence is in GCG format. $ $ if(seqinfotype.NES."NOGCG") then goto get_start $ ws "" $ ws "''bell'''seqname' is not a legitimate GCG sequence file!" $ ws "" $ ws "Select option by number -" $ ws "" $ ws "1) Specify another sequence" $ ws "2) Quit" $ ws "" $ iq choice "Choice (* 1 *) ? " $ if(choice.EQS."2") then exit $ goto get_query $ $get_start: $ $ ws "" $ iq begin "Begin (* 1 *) ? " $ if (begin.EQS."") then begin := 1 $ ibegin = f$integer(begin) $ if((ibegin.GE.1).AND.(ibegin.LT.f$integer(seqinfolength))) then - goto get_end $ ws "''bell'The start must be between 1 and ''seqinfolength'." $ goto get_start $ $get_end: $ $ iq end "End (* ''seqinfolength' *) ? " $ if (end.EQS."") then end := 'seqinfolength' $ iend = f$integer(end) $ if((iend.GT.ibegin).AND.(iend.LE.f$integer(seqinfolength))) then - goto get_reverse $ ws "''bell'The start must be between 1 and ''seqinfolength'." $ goto get_end $ $get_reverse: $ $ iq reverse "Reverse (* No *)? " $ if(reverse.EQS."") then reverse := NO $ if(f$extract(0,1,reverse).EQS."Y") then reverse := "YES" $ if(f$extract(0,1,reverse).EQS."N") then reverse := "NO" $ if(reverse.EQS."YES") then goto get_database $ if(reverse.EQS."NO") then goto get_database $ ws "''bell'Please answer Yes or No." $ goto get_reverse $ $get_database: $ $ ! Find out which database to search. If the sequence is DNA, the $ ! default is the Genbank database. The default for proteins is $ ! the SWISS-PROT database $ $ ws "" $ ws "Database to search:" $ ws "" $ if(seqinfotype.EQS."PROTEIN") then goto get_pepdatabase $ ws " 1) ALL of GenBank" $ ws " 2) GenBank Primate sequences" $ ws " 3) GenBank Rodent sequences" $ ws " 4) Other GenBank Mammalian sequences" $ ws " 5) Other GenBank Vertebrate sequences" $ ws " 6) GenBank Invertebrate sequences" $ ws " 7) GenBank Plant sequences" $ ws " 8) GenBank Bacterial sequences" $ ws " 9) GenBank Organelle sequences" $ ws "10) GenBank Phage sequences" $ ws "11) GenBank Viral sequences" $ ws "12) GenBank Structural RNA sequences" $ ws "13) GenBank Synthetic sequences" $ ws "14) GenBank Unannotated sequences" $ ws "15) New GenBank sequences since the last quarterly release" $ ws "16) ALL of EMBL" $ ws "17) New EMBL sequences since the last release $ ws "" $ iq choice "Please enter choice (* 1 *): " $ if(choice.EQS."") then choice := 1 $ database := "" $ if(choice.EQS."1") then database := GENBANK/ALL $ if(choice.EQS."2") then database := GENBANK/PRIMATE $ if(choice.EQS."3") then database := GENBANK/RODENT $ if(choice.EQS."4") then database := GENBANK/OTHER_MAMMALIAN $ if(choice.EQS."5") then database := GENBANK/OTHER_VERTEBRATE $ if(choice.EQS."6") then database := GENBANK/INVERTEBRATE $ if(choice.EQS."7") then database := GENBANK/PLANT $ if(choice.EQS."8") then database := GENBANK/BACTERIAL $ if(choice.EQS."9") then database := GENBANK/ORGANELLE $ if(choice.EQS."10") then database := GENBANK/PHAGE $ if(choice.EQS."11") then database := GENBANK/VIRAL $ if(choice.EQS."12") then database := GENBANK/STRUCTURAL_RNA $ if(choice.EQS."13") then database := GENBANK/SYNTHETIC $ if(choice.EQS."14") then database := GENBANK/UNANNOTATED $ if(choice.EQS."15") then database := GENBANK/NEW $ if(choice.EQS."16") then database := EMBL/ALL $ if(choice.EQS."17") then database := EMBL/NEW $ if(database.NES."") then goto get_wordsize $ ws "''bell'Valid responses are 1 - 17, inclusive." $ goto get_database $ $get_pepdatabase: $ $ ws "1) ALL of SWISS-PROT" $ ws "2) ALL of the translated GenBank sequences" $ ws "3) New translated GenBank sequences since last quarterly release" $ ws "" $ iq choice "Please enter choice (* 1 *): " $ if(choice.EQS."") then choice := 1 $ database = "" $ if(choice.EQS."1") then database := SWISS-PROT/ALL $ if(choice.EQS."2") then database := GENPEPT/ALL $ if(choice.EQS."3") then database := GENPEPT/NEW $ if(database.NES."") then goto get_wordsize $ ws "''bell'Valid responses are 1 - 3, inclusive." $ goto get_database $ $get_wordsize: $ $ ! Find out how long the word should be $ $ if(seqinfotype.EQS."PROTEIN") then goto get_pwordsize $ $ ws "" $ iq wordsize "What word size (* 4 *)? " $ if(wordsize.EQS."") then wordsize := 4 $ if((f$integer(wordsize).GT.2).AND.(f$integer(wordsize).LT.7)) - then goto get_nscores $ ws "''bell'Word size must be in the range from 3 to 6." $ goto get_wordsize $ $get_pwordsize: $ $ ws "" $ iq wordsize "What word size (* 1 *)? " $ if(wordsize.EQS."") then wordsize := 1 $ if((f$integer(wordsize).GT.0).AND.(f$integer(wordsize).LT.3)) - then goto get_nscores $ ws "''bell'Word size must be in the range from 1 to 2." $ goto get_pwordsize $ $get_nscores: $ $ ws "" $ iq nscores "List how many best scores (* 100 *)? " $ if(nscores.EQS."") then nscores := 100 $ if(f$integer(nscores).GE.10) then goto get_nalign $ ws "''bell'At least 10 scores should be listed." $ goto get_nscores $ $get_nalign: $ $ ws "" $ iq nalign "Align how many matches (* 20 *)? " $ if(nalign.EQS."") then nalign := 20 $ if(nalign.GT.4) then goto summarize $ ws "''bell'At least 5 alignments should be shown." $ goto get_nalign $ $summarize: $ $ ws "" $ ws "" $ ws "The following FASTA search will be executed:" $ ws "" $ if(reverse.EQS."NO") then ws "Query sequence: ''seqname' from ", - "''begin' to ''end' (''seqinfotype')" $ if(reverse.EQS."YES") then ws "Query sequence: Reverse of ''seqname'", - " from ''begin' to ''end' (''seqinfotype')" $ ws "Database to be searched: ''database'" $ ws "Word size: ''wordsize'" $ ws "Scores to list: ''nscores'" $ ws "Alignments to show: ''nalign'" $ ws "" $ iq choice "Are these parameters correct (* Yes *)? " $ choice = f$extract(0, 1, choice) $ if(choice.EQS."") then goto do_it $ if(choice.EQS."Y") then goto do_it $ $ ! Something is wrong. Give the chance to correct it, or give up. $ $ask_repeat: $ $ ws "" $ ws "Do you want to" $ ws "" $ ws "1) Try again" $ ws "2) Give up" $ ws "" $ iq choice "Please enter the number of your choice (* 1 *): " $ if(choice.eqs."") then goto get_query $ if(choice.eqs."1") then goto get_query $ if(choice.eqs."2") then exit $ ws "''bell'Wasn't the question simple enough for you?" $ goto ask_repeat $ $do_it: $ $ ! Determine the root name of the sequence for comparison. This in used $ ! in specifying the output file name, which has the extension .FAM. $ $ ! Check if the sequence is a file. If not, assume it is a database $ ! entry and get the locus name to use as a root. $ $ seqroot = seqname $ root = f$parse(seqroot,,,"NAME") ! root name of sequence $ if(root.NES."") then goto set_outname $ $ ! Remove database name $ $ pos = 'f$locate(":", seqroot)' $ len = 'f$length(seqroot)' $ if(pos.NE.len) then root = f$extract(pos+1, len-pos, seqroot) $ $ set_outname: $ $ outname := "''root'.FAM" $ $ ! Convert the sequence to Intelligenetics format. Since ToIG does not $ ! allow command line input, need to construct a little command $ ! procedure to substitute in the appropriate sequence names. $ $ ws "" $ ws "Converting ''seqname' to IntelliGenetics format..." $ ws "" $ $ open/write comfile fam.cmd $ wc := "write comfile" $ wc "$ set noon" $ wc "$ set verify" $ wc "ToIG" $ wc "''seqname'" $ wc "''begin'" $ wc "''end'" $ wc "''reverse'" ! Reverse? $ wc "''root'.IG" ! Output file name $ wc "$ set noverify" $ close comfile $ @fam.cmd $ del fam.cmd;0 $ $ ! Write the text file that will be mailed to GenBank $ $ ws "Creating the file to be mailed to GenBank..." $ $ open/write comfile 'outname' $ wc "DATALIB ''database'" $ wc "KTUP ''wordsize'" $ wc "SCORES ''nscores'" $ wc "ALIGNMENTS ''nalign'" $ wc "BEGIN" $ $ ! The IG format files that GCG makes starts with a blank line, which $ ! chokes the GenBank FASTA program. Therefore it is not possible to $ ! just append the sequence file to the text file. Read it line by $ ! line, copying over only those records that aren't zero length. (At $ ! the present time there is just one blank line - the first one - but $ ! who knows what the future has in store for us?) $ $ open/read igfile 'root'.IG $ $read_loop: $ $ read/end_of_file=eof igfile record $ if(f$length(record).EQ.0) then goto read_loop $ write comfile record $ $ goto read_loop $ $eof: $ $ close igfile $ close comfile $ delete 'root'.IG;0 $ $ ! Mail the file away. $ $ ws "Mailing the file to GenBank..." $ $ mail 'outname' 'search_address' $ $ ws "The file ''outname' has been sent to GenBank." $ ws "The results will be mailed back to you shortly." $ ws "" $ $ delete 'outname';0 $ $ terminate: ! Jump here on ^y. Don't do any cleanup $ $ exit