#!/usr/local/bin/perl -w ########################################################################### # This script takes a sequence (in FASTA or RAW format) # BLASTS the sequence against the DBEST database # Returns the top 10 results that have an E Value better than .0001 # It then looks up the accession numbers to the BLAST Results # against the UNIGENE Data file # It returns the best hit # If there are no hits it returns nothing # If there were more than one Unigene Cluster returned from the top 10 est # hits a * is tacked on to the best hit. ########################################################################### # Script requires the Hs.data file from Unigene # ftp://ftp.ncbi.nlm.nih.gov/repository/UniGene/ # Script also requires NHGRI::Blastall # ftp://ftp.nhgri.nih.gov/pub/software/blastall/ ########################################################################### use NHGRI::Blastall; use strict; use vars qw($VERSION $HS_DATA_FILE %UNIGENE_HASH %HITS); $VERSION = 0.01; $HS_DATA_FILE = './Hs.data'; %UNIGENE_HASH = (); %HITS = (); MAIN: { my $seq_file = $ARGV[0] || die "usage: $0 SEQFILE\n"; my $b = new NHGRI::Blastall; $b->blastall( {'p' => 'blastn', 'a' => 4, 'e' => '.0001', 'b' => 10, 'd' => 'est', 'i' => $seq_file } ); my @ids = $b->result('id'); read_unigene_into_memory(); foreach my $id (@ids) { $id =~ m/^[^\|]*\|([^\|]*)\|/; my $accession = $1; $HITS{$UNIGENE_HASH{$accession}}++ if ($UNIGENE_HASH{$accession}); } print_stats(); } sub print_stats { my @hits = sort {$HITS{$b} <=> $HITS{$a} } keys %HITS; if (@hits > 1) { print "$hits[0]*\n"; } elsif (@hits == 1) { print "$hits[0]\n"; } } sub read_unigene_into_memory { my $current_cluster_id; open IN, $HS_DATA_FILE or die "cannot open $HS_DATA_FILE:$!"; while () { chomp; if (/^ID\s+(.*)$/) { $current_cluster_id = $1; } elsif (/^SEQUENCE\s+ACC=(\S+);/) { $UNIGENE_HASH{$1} = $current_cluster_id; } } }