#!/usr/bin/perl
=head1 DESCRIPTION
genoblastformat.pl
split and shred genome fasta
-- max chunk size (e.g. 50kb .. 1,000 kb are good ranges)
-- overlap size (e.g. 50,000 bases to prevent splitting genes)
-- randomize or flatten size distribution in file
-- blast formatdb
=head1 AUTHOR
Don Gilbert, gilbertd@indiana.edu, 2005/2006.
for drosophila species and daphnia genome blast annotations
=cut
use strict;
use Getopt::Long;
my $doprotformat=0;
my $dodnaformat=0;
my $doaaformat=0;
my $doshredchr= 0;
my $debug= 1;
my($shredsize,$shredoverlap)= (40000,20000);
my $opts=" -p T -o T "; # -v 4000 is bigger than any protdb ; protdb sizes fit in mem < 1GB
my $formatdb="/bio/bio-grid/ncbi-20050429/ncbi/bin/formatdb";
my $webnaopts= " -p F -o F ";
my $webaaopts= " -p T -o F ";
my $naopts= " -p F -o T ";
my $mpiopts=" --frag-size 4000000000 -p T -o F ";
my $mpiformatdb="/bio/bio-grid/blast/bin/mpiformatdb";
my ($fadir,$dbdir)=("protsrc/","protdb/");
my $blid= 'noname'; #??
my $dbid= '';
# ones with good MOD gene, chr info
my @protXchrQuery=qw(
modSC modCE modDM ensHS modMM ensDR ensAG ncbAT modOG
);
# modRR modDD ensAG ? mosq.
my %proteomesf=(
ensAG => "Anopheles_gambiae.MOZ2a.dec.pep.fa.gz",
ensAM => "Apis_mellifera.AMEL1.1.dec.pep.fa.gz",
ncbAT => "Arabidopsis_thaliana_NC_003070-76.fa.gz",
modCB => "Caenorhabditis-briggsae_WormBase_brigpep2_protein-reps.fa.gz",
modCE => "Caenorhabditis-elegans_WormBase_WS130_protein-reps.fa.gz",
ensCF => "Canis_familiaris.BROADD1.dec.pep.fa.gz",
ensDR => "Danio_rerio.ZFISH4.dec.pep.fa.gz",
modDD => "Dictyostelium-discoideum_dictyBase_01172005_protein-reps.fa.gz",
modDM => "Drosophila-melanogaster_FlyBase_r4.0_protein-reps.fa.gz",
modDP => "Drosophila-pseudoobscura_FlyBase_r1.03_protein-reps.fa.gz",
ensFR => "Fugu_rubripes.FUGU2.dec.pep.fa.gz",
ensGG => "Gallus_gallus.WASHUC1.dec.pep.fa.gz",
ensHS => "Homo_sapiens.NCBI35.dec.pep.fa.gz",
modMM => "Mus-musculus_MGI_01282005_protein-reps.fa.gz",
modOG => "Oryza_Gramene_r16.0_protein-reps.fa.gz",
ensPT => "Pan_troglodytes.CHIMP1.dec.pep.fa.gz",
modRR => "Rattus-norvegicus_RGD_version_protein-reps.fa.gz",
modSC => "Saccharomyces-cerevisiae_SGD_08272004_protein-reps.fa.gz",
## add 4 more from inparanoid collection
sanPF => "Plasmodium_falciparum_Pfa3D7.fa.gz",
modSP => "Schizosaccharomyces-pombe_GeneDB_Spombe_02042005_protein-reps.fa.gz",
ensTN => "Tetraodon_nigroviridis.TETRAODON7.dec.pep.fa.gz",
ncbEC => "Escherichia_coli_K12_NC_000913.fa.gz",
); # 22
#Escherichia_coli_K12_NC_000913.fa.gz
#Plasmodium_falciparum_Pfa3D7.fa.gz human malaria protozoan parasite
#Schizosaccharomyces-pombe_GeneDB_Spombe_02042005_protein-reps.fa.gz
#Tetraodon_nigroviridis.TETRAODON7.dec.pep.fa.gz freshwater pufferfish
my @protkeys= sort keys %proteomesf;
my ($input,$output,$flattenfastaorder);
my $ok= Getopt::Long::GetOptions(
'chromosome!' => \$doshredchr,
'proteomes!' => \$doprotformat,
'dnablast!' => \$dodnaformat,
'aablast!' => \$doaaformat,
'flattenfasta!' => \$flattenfastaorder,
'debug!' => \$debug,
'input=s' => \$input,
'output=s' => \$output,
#'format!' => \$doformat,
'fastadir=s' => \$fadir,
'dbdir=s' => \$dbdir,
'blid=s' => \$blid,
'shredsize=s' => \$shredsize,
'shredoverlap=s' => \$shredoverlap,
);
die <gi.\\d+.ref\\|(\\S+)/>$blid|\$1 \$1/){s/>(\\S+)/>$blid|\$1 \$1/;}' |\\
$formatdb -n $dbid $bopts -i stdin ");
}
if ($doprotformat) {
perOrgFormat($fadir,$dbdir,\%proteomesf);
}
#---------------------
sub perOrgFormat {
my($fadir,$dbdir,$dbset)= @_;
my ($dbid,$fa);
while ( ($dbid,$fa)= each(%$dbset) ) {
(my $title=$fa) =~ s/.gz$//;
if (-e "$dbdir/$dbid.phr") { warn "already done $dbdir/$dbid\n"; next; }
warn "formatdb -n $dbid -t $title $opts \n" if $debug;
#system("zcat $fa | perl -pe's/>(\\S+)/>gnl|$dbid|\$1/;' > $dbid.fa");
system("gunzip -c $fadir$fa | \\
perl -pe'unless(s/>gi.\\d+.ref\\|(\\S+)/>gnl|$dbid|\$1 \$1/){s/>(\\S+)/>gnl|$dbid|\$1 \$1/;}' \\
> $dbdir$dbid.fa");
system("cd $dbdir; $formatdb -n $dbid -t $title $opts -i $dbid.fa ");
}
## fix this - bad for blast -o T
## >gnl|ncbAT|gi|15223276|ref|NP_171609.1|
## >gnl|ncbAT|NP_171609.1|
# unless(s/>gi\\|(\\S+)/>gnl|$dbid|\$1/){s/>(\\S+)/>gnl|$dbid|\$1/;}
# unless(s/>gi.\\d+.ref\\|(\\S+)/>gnl|$dbid|\$1/){s/>(\\S+)/>gnl|$dbid|\$1/;}
# system("zcat $fa | perl -pe's/>(\\S+)/>gnl|$dbid|\$1/;' | $formatdb -n $dbid -t $title $opts -i stdin");
}
sub makeFastaDb {
my($fadir,$dbdir,$dbset)= @_;
my ($dbid,$fa);
while ( ($dbid,$fa)= each(%$dbset) ) {
# system("zcat $fa | perl -pe's/>(\\S+)/>gnl|$dbid|\$1/;' > $dbid.fa");
system("gunzip -c $fadir$fa | \\
perl -pe'unless(s/>gi.\\d+.ref\\|(\\S+)/>gnl|$dbid|\$1/){s/>(\\S+)/>gnl|$dbid|\$1/;}' \\
> $dbdir$dbid.fa");
}
}
sub putShred {
my($TO,$id,$def,$seq,$psize,$poverlap)= @_;
my ($start,$total,$i)= (0,0);
my $len= length($seq);
for($i=1; $start < $len; $i++) {
my $pseq;
if ($len - $start < $psize+$poverlap) {
if ($start == 0) { $pseq= $seq; }
else { $pseq= substr($seq,$start); }
$poverlap= $len - $start + 1;
} else {
$pseq= substr($seq,$start,$psize);
}
my $nb = length($pseq);
my ($istart,$stop)= ($start+1, $start + $nb);
#my $pdef= "start_${i}=$istart; len_${i}=$nb; overlap=$poverlap; $def"; # messy field names!
my $pdef= "pt=$i; pt_start=$istart; pt_len=$nb; overlap=$poverlap; $def";
my $pid = "${id}_${istart}_${stop}"; #? id = chr?; use Chr:start-stop format ?
$pseq =~ s/(.{1,50})/$1\n/g;
print $TO ">$pid $pdef\n",$pseq,"\n";
$start += $poverlap;
$total += $nb; # has overlapped
}
$i--;
warn "shredded: $id, len=$len to $i parts (size=$psize, totlen=$total)\n" if $debug;
}
# >2L type=chromosome; loc=2L:1..22407834; ID=2L; release=r4.1; species=dmel
sub shredChromosome {
my($fafile,$psize,$poverlap,$dbid)= @_;
$psize= 40000 unless($psize>0);
$poverlap= int($psize/2) unless($poverlap>0);
(my $to= $fafile) =~ s/\.fa\w*(.gz)?$//;
$to= substr($to,1+rindex($to,'/')) if ($to =~ m,/,); ##$to =~ s,.*/([^/]+)$,$1,;
$to .="-shredded-$psize-$poverlap.fa";
if ($fafile =~/.gz$/){ open(F,"gunzip -c $fafile|") or die" open $fafile"; }
else { open(F,$fafile) or die" open $fafile"; }
open(TO,">$to") or die "$to";
my($id,$seq,$def);
while() {
chomp;
if (/^>(\S+)\s*(.*)$/){
my($a,$b)=($1,$2);
if ($id && $seq) {
putShred(*TO,$id,$def,$seq,$psize,$poverlap);
}
$id=$a; $def=$b; $seq='';
$id="$dbid|$id" if $dbid;
}
elsif (/^\S/) {
$seq .= $_;
}
}
putShred(*TO,$id,$def,$seq,$psize,$poverlap) if ($id && $seq);
close(F); close(TO);
return $to;
}
=item homogenizeFastaSizeOrder
for parallel-blast, put fasta lib into
order where entry sizes are homogenized, so split by count
wont end up with a few par. parts that take way longer than others
need two passes; 1 = count fa lengths; 2 = rewrite in better order
=cut
sub homogenizeFastaSizeOrder {
my($infile,$outfile)= @_;
my($inh,$outh);
if (!$infile || $infile =~ m/^(stdin|\-)$/i) {
## no good, need file for randacc# $inh= *STDIN;
die "Input file required for homogenize fasta sizes";
}
else {
unless (open(R,$infile)) { warn "cannot read $infile"; return undef; }
$inh= *R;
}
warn "\nparsing $infile to $outfile\n" if $debug;
if (!$outfile || $outfile =~ m/^(stdout|\-)$/i) { $outh= *STDOUT; }
else {
unless (open(O,">$outfile")) { warn "cannot write $outfile"; return undef; }
$outh= *O;
}
# 1.count lens
my %locs=(); my %szbins=();
my($start, $at, $id, $size, $nentry, $totsize)= (0,0,undef,0,0,0);
while(<$inh>) {
if(/^>(\S+)/) {
my $id1= $1;
if ($size) {
my $szbin= 500 * int( $size / 500);
$szbins{$szbin}++;
$totsize += $size;
}
$locs{$id}= [$id, $start, $at - $start, $size] if ($id);
$id= $id1;
$start= $at;
$nentry++;
$size= 0;
}
else{
my $nc= tr/A-z/A-z/; $size += $nc;
}
$at= tell($inh);
}
$locs{$id}= [$id, $start, $at - $start, $size] if ($id); # last
# close($inh); open(R,$infile); $inh= *R;
# ?need special open for randome access?
# 2. reorder by sizes - how?
# ? assume lots, try just randomize ids ?
my $avesize= int( $totsize / $nentry);
my @ids= keys %locs;
my @sids= sort{ rand() <=> rand() } @ids;
warn "read $nentry records; ave size=$avesize\n" if $debug;
my %szbinout=();
my $nout= 0;
my $buf='';
foreach my $lc (@sids) {
my( $id, $start, $flen, $size)= @{$locs{$lc}};
$buf='';
seek($inh,$start,0);
read($inh,$buf,$flen);
print $outh $buf;
$nout++;
my $nbin= int(30 * $nout / $nentry);
$szbinout{$nbin} += $size;
}
close($outh); close($inh);
if ($debug) {
print STDERR "output size distribution:\n";
foreach my $bn (sort{$a <=> $b} keys %szbinout) {
print STDERR $bn,"=",$szbinout{$bn}," ";
}
print STDERR "\n";
}
}
__END__