#!/usr/bin/perl =head1 DESCRIPTION genoblastformat.pl split and shred genome fasta -- max chunk size (e.g. 50kb .. 1,000 kb are good ranges) -- overlap size (e.g. 50,000 bases to prevent splitting genes) -- randomize or flatten size distribution in file -- blast formatdb =head1 AUTHOR Don Gilbert, gilbertd@indiana.edu, 2005/2006. for drosophila species and daphnia genome blast annotations =cut use strict; use Getopt::Long; my $doprotformat=0; my $dodnaformat=0; my $doaaformat=0; my $doshredchr= 0; my $debug= 1; my($shredsize,$shredoverlap)= (40000,20000); my $opts=" -p T -o T "; # -v 4000 is bigger than any protdb ; protdb sizes fit in mem < 1GB my $formatdb="/bio/bio-grid/ncbi-20050429/ncbi/bin/formatdb"; my $webnaopts= " -p F -o F "; my $webaaopts= " -p T -o F "; my $naopts= " -p F -o T "; my $mpiopts=" --frag-size 4000000000 -p T -o F "; my $mpiformatdb="/bio/bio-grid/blast/bin/mpiformatdb"; my ($fadir,$dbdir)=("protsrc/","protdb/"); my $blid= 'noname'; #?? my $dbid= ''; # ones with good MOD gene, chr info my @protXchrQuery=qw( modSC modCE modDM ensHS modMM ensDR ensAG ncbAT modOG ); # modRR modDD ensAG ? mosq. my %proteomesf=( ensAG => "Anopheles_gambiae.MOZ2a.dec.pep.fa.gz", ensAM => "Apis_mellifera.AMEL1.1.dec.pep.fa.gz", ncbAT => "Arabidopsis_thaliana_NC_003070-76.fa.gz", modCB => "Caenorhabditis-briggsae_WormBase_brigpep2_protein-reps.fa.gz", modCE => "Caenorhabditis-elegans_WormBase_WS130_protein-reps.fa.gz", ensCF => "Canis_familiaris.BROADD1.dec.pep.fa.gz", ensDR => "Danio_rerio.ZFISH4.dec.pep.fa.gz", modDD => "Dictyostelium-discoideum_dictyBase_01172005_protein-reps.fa.gz", modDM => "Drosophila-melanogaster_FlyBase_r4.0_protein-reps.fa.gz", modDP => "Drosophila-pseudoobscura_FlyBase_r1.03_protein-reps.fa.gz", ensFR => "Fugu_rubripes.FUGU2.dec.pep.fa.gz", ensGG => "Gallus_gallus.WASHUC1.dec.pep.fa.gz", ensHS => "Homo_sapiens.NCBI35.dec.pep.fa.gz", modMM => "Mus-musculus_MGI_01282005_protein-reps.fa.gz", modOG => "Oryza_Gramene_r16.0_protein-reps.fa.gz", ensPT => "Pan_troglodytes.CHIMP1.dec.pep.fa.gz", modRR => "Rattus-norvegicus_RGD_version_protein-reps.fa.gz", modSC => "Saccharomyces-cerevisiae_SGD_08272004_protein-reps.fa.gz", ## add 4 more from inparanoid collection sanPF => "Plasmodium_falciparum_Pfa3D7.fa.gz", modSP => "Schizosaccharomyces-pombe_GeneDB_Spombe_02042005_protein-reps.fa.gz", ensTN => "Tetraodon_nigroviridis.TETRAODON7.dec.pep.fa.gz", ncbEC => "Escherichia_coli_K12_NC_000913.fa.gz", ); # 22 #Escherichia_coli_K12_NC_000913.fa.gz #Plasmodium_falciparum_Pfa3D7.fa.gz human malaria protozoan parasite #Schizosaccharomyces-pombe_GeneDB_Spombe_02042005_protein-reps.fa.gz #Tetraodon_nigroviridis.TETRAODON7.dec.pep.fa.gz freshwater pufferfish my @protkeys= sort keys %proteomesf; my ($input,$output,$flattenfastaorder); my $ok= Getopt::Long::GetOptions( 'chromosome!' => \$doshredchr, 'proteomes!' => \$doprotformat, 'dnablast!' => \$dodnaformat, 'aablast!' => \$doaaformat, 'flattenfasta!' => \$flattenfastaorder, 'debug!' => \$debug, 'input=s' => \$input, 'output=s' => \$output, #'format!' => \$doformat, 'fastadir=s' => \$fadir, 'dbdir=s' => \$dbdir, 'blid=s' => \$blid, 'shredsize=s' => \$shredsize, 'shredoverlap=s' => \$shredoverlap, ); die <gi.\\d+.ref\\|(\\S+)/>$blid|\$1 \$1/){s/>(\\S+)/>$blid|\$1 \$1/;}' |\\ $formatdb -n $dbid $bopts -i stdin "); } if ($doprotformat) { perOrgFormat($fadir,$dbdir,\%proteomesf); } #--------------------- sub perOrgFormat { my($fadir,$dbdir,$dbset)= @_; my ($dbid,$fa); while ( ($dbid,$fa)= each(%$dbset) ) { (my $title=$fa) =~ s/.gz$//; if (-e "$dbdir/$dbid.phr") { warn "already done $dbdir/$dbid\n"; next; } warn "formatdb -n $dbid -t $title $opts \n" if $debug; #system("zcat $fa | perl -pe's/>(\\S+)/>gnl|$dbid|\$1/;' > $dbid.fa"); system("gunzip -c $fadir$fa | \\ perl -pe'unless(s/>gi.\\d+.ref\\|(\\S+)/>gnl|$dbid|\$1 \$1/){s/>(\\S+)/>gnl|$dbid|\$1 \$1/;}' \\ > $dbdir$dbid.fa"); system("cd $dbdir; $formatdb -n $dbid -t $title $opts -i $dbid.fa "); } ## fix this - bad for blast -o T ## >gnl|ncbAT|gi|15223276|ref|NP_171609.1| ## >gnl|ncbAT|NP_171609.1| # unless(s/>gi\\|(\\S+)/>gnl|$dbid|\$1/){s/>(\\S+)/>gnl|$dbid|\$1/;} # unless(s/>gi.\\d+.ref\\|(\\S+)/>gnl|$dbid|\$1/){s/>(\\S+)/>gnl|$dbid|\$1/;} # system("zcat $fa | perl -pe's/>(\\S+)/>gnl|$dbid|\$1/;' | $formatdb -n $dbid -t $title $opts -i stdin"); } sub makeFastaDb { my($fadir,$dbdir,$dbset)= @_; my ($dbid,$fa); while ( ($dbid,$fa)= each(%$dbset) ) { # system("zcat $fa | perl -pe's/>(\\S+)/>gnl|$dbid|\$1/;' > $dbid.fa"); system("gunzip -c $fadir$fa | \\ perl -pe'unless(s/>gi.\\d+.ref\\|(\\S+)/>gnl|$dbid|\$1/){s/>(\\S+)/>gnl|$dbid|\$1/;}' \\ > $dbdir$dbid.fa"); } } sub putShred { my($TO,$id,$def,$seq,$psize,$poverlap)= @_; my ($start,$total,$i)= (0,0); my $len= length($seq); for($i=1; $start < $len; $i++) { my $pseq; if ($len - $start < $psize+$poverlap) { if ($start == 0) { $pseq= $seq; } else { $pseq= substr($seq,$start); } $poverlap= $len - $start + 1; } else { $pseq= substr($seq,$start,$psize); } my $nb = length($pseq); my ($istart,$stop)= ($start+1, $start + $nb); #my $pdef= "start_${i}=$istart; len_${i}=$nb; overlap=$poverlap; $def"; # messy field names! my $pdef= "pt=$i; pt_start=$istart; pt_len=$nb; overlap=$poverlap; $def"; my $pid = "${id}_${istart}_${stop}"; #? id = chr?; use Chr:start-stop format ? $pseq =~ s/(.{1,50})/$1\n/g; print $TO ">$pid $pdef\n",$pseq,"\n"; $start += $poverlap; $total += $nb; # has overlapped } $i--; warn "shredded: $id, len=$len to $i parts (size=$psize, totlen=$total)\n" if $debug; } # >2L type=chromosome; loc=2L:1..22407834; ID=2L; release=r4.1; species=dmel sub shredChromosome { my($fafile,$psize,$poverlap,$dbid)= @_; $psize= 40000 unless($psize>0); $poverlap= int($psize/2) unless($poverlap>0); (my $to= $fafile) =~ s/\.fa\w*(.gz)?$//; $to= substr($to,1+rindex($to,'/')) if ($to =~ m,/,); ##$to =~ s,.*/([^/]+)$,$1,; $to .="-shredded-$psize-$poverlap.fa"; if ($fafile =~/.gz$/){ open(F,"gunzip -c $fafile|") or die" open $fafile"; } else { open(F,$fafile) or die" open $fafile"; } open(TO,">$to") or die "$to"; my($id,$seq,$def); while() { chomp; if (/^>(\S+)\s*(.*)$/){ my($a,$b)=($1,$2); if ($id && $seq) { putShred(*TO,$id,$def,$seq,$psize,$poverlap); } $id=$a; $def=$b; $seq=''; $id="$dbid|$id" if $dbid; } elsif (/^\S/) { $seq .= $_; } } putShred(*TO,$id,$def,$seq,$psize,$poverlap) if ($id && $seq); close(F); close(TO); return $to; } =item homogenizeFastaSizeOrder for parallel-blast, put fasta lib into order where entry sizes are homogenized, so split by count wont end up with a few par. parts that take way longer than others need two passes; 1 = count fa lengths; 2 = rewrite in better order =cut sub homogenizeFastaSizeOrder { my($infile,$outfile)= @_; my($inh,$outh); if (!$infile || $infile =~ m/^(stdin|\-)$/i) { ## no good, need file for randacc# $inh= *STDIN; die "Input file required for homogenize fasta sizes"; } else { unless (open(R,$infile)) { warn "cannot read $infile"; return undef; } $inh= *R; } warn "\nparsing $infile to $outfile\n" if $debug; if (!$outfile || $outfile =~ m/^(stdout|\-)$/i) { $outh= *STDOUT; } else { unless (open(O,">$outfile")) { warn "cannot write $outfile"; return undef; } $outh= *O; } # 1.count lens my %locs=(); my %szbins=(); my($start, $at, $id, $size, $nentry, $totsize)= (0,0,undef,0,0,0); while(<$inh>) { if(/^>(\S+)/) { my $id1= $1; if ($size) { my $szbin= 500 * int( $size / 500); $szbins{$szbin}++; $totsize += $size; } $locs{$id}= [$id, $start, $at - $start, $size] if ($id); $id= $id1; $start= $at; $nentry++; $size= 0; } else{ my $nc= tr/A-z/A-z/; $size += $nc; } $at= tell($inh); } $locs{$id}= [$id, $start, $at - $start, $size] if ($id); # last # close($inh); open(R,$infile); $inh= *R; # ?need special open for randome access? # 2. reorder by sizes - how? # ? assume lots, try just randomize ids ? my $avesize= int( $totsize / $nentry); my @ids= keys %locs; my @sids= sort{ rand() <=> rand() } @ids; warn "read $nentry records; ave size=$avesize\n" if $debug; my %szbinout=(); my $nout= 0; my $buf=''; foreach my $lc (@sids) { my( $id, $start, $flen, $size)= @{$locs{$lc}}; $buf=''; seek($inh,$start,0); read($inh,$buf,$flen); print $outh $buf; $nout++; my $nbin= int(30 * $nout / $nentry); $szbinout{$nbin} += $size; } close($outh); close($inh); if ($debug) { print STDERR "output size distribution:\n"; foreach my $bn (sort{$a <=> $b} keys %szbinout) { print STDERR $bn,"=",$szbinout{$bn}," "; } print STDERR "\n"; } } __END__