# SNAP training with PASA cDNA-assembly train genes
# 08jun for pea aphid genome
#
# Commands below use csh/tcsh-style syntax: `|&` pipes stdout+stderr,
# `>!` overwrites an existing file even when noclobber is set.
# $bg and $snap must already be set; they are used as path prefixes for
# gff2zff2.pl and for the SNAP programs (fathom, forge, hmm-assembler.pl).
#
# Note: nodupgenes.gff is the same PASA-generated trainingSetGenes.gff as used
#   for augustus, with similar genes removed by blat -minIdentity 70 of proteins
# Note: the training genes .gff and the .fasta need to correspond in order of
#   scaffolds, i.e. no extra scaffolds w/o genes; order both by >scaffold
#   number sort (or alpha sort...)

# prepare snap train genes from pasa traininggenes.gff, genome.fa
$bg/blast/gff2zff2.pl -hmm -absscore=10 < nodupgenes.gff > nodupgenes.zff
$snap/fathom nodupgenes.zff ../Acyr20071212-genome.fa -validate |& more

# list the '>' header ids found in the zff (with the '>' stripped)
grep '^>' *zff | perl -pe 's/>//;' >! scaf.list
# or use nodupgenes.gff for scaf.list

# Build a genome fasta restricted to the listed scaffolds: lines starting
# with SC... (the scaf.list entries, read first) register wanted ids; a fasta
# record is then printed only when its '>' header id was registered.
cat scaf.list ../Acyr*fa \
  | perl -ne 'if(/^(SC\w+)/){$s{$1}++;$p=0;}elsif(/^>(\w+)/){ $p=($s{$1})?1:0;} print if($p);' \
  > acyr-nodupgeno.fa

$snap/fathom nodupgenes.zff acyr-nodupgeno.fa -validate |& more

# regenerate the zff (overwriting the earlier one) and validate again
$bg/blast/gff2zff2.pl -hmm -absscore=10 < nodupgenes.gff >! nodupgenes.zff
$snap/fathom nodupgenes.zff acyr-nodupgeno.fa -validate |& more

# Re-order the gff: a tab is temporarily inserted after "SCAFFOLD" so the
# scaffold number becomes its own sort field (field 2) and the 5th field is
# the secondary numeric key; the tab is removed again after sorting.
# Only lines containing SCAF are kept.
cat nodupgenes.gff \
  | perl -ne 's/SCAFFOLD/SCAFFOLD\t/; print if (/SCAF/);' \
  | sort -k2,2n -k5,5n \
  | perl -pe 's/SCAFFOLD\t/SCAFFOLD/;' > nod.gff
mv nod.gff nodupgenes.gff

# rebuild the zff from the sorted gff and validate once more
$bg/blast/gff2zff2.pl -hmm -absscore=10 < nodupgenes.gff >! nodupgenes.zff
$snap/fathom nodupgenes.zff acyr-nodupgeno.fa -validate |& more

#..........................
# create best gene set (uni.ann, uni.dna) from genome,genes.zff
$snap/fathom nodupgenes.zff acyr-nodupgeno.fa -categorize 1000

# create export.* training genes for SNAP
$snap/fathom uni.ann uni.dna -export 1000 -plus

# create HMM data set
mkdir params
cd params
$snap/forge ../export.ann ../export.dna

# assemble Snap HMM
cd ..
$snap/hmm-assembler.pl aphid params > aphid.hmm
cp aphid.hmm $snap/HMM/aphid

#-- make snap predictions
## really need snap -gff3 modification to newest snap code; other dgg changes?
## add snap -aa -tx outputs

# Run snap with the "aphid" HMM over acyr-genome.fa: once with -gff output,
# once with -gff3 output (the second run is started in the background).
# $snap is the SNAP install prefix; $gbn is the prefix of the target
# databases/ tree used by the copy further down.
$snap/snap -quiet -gff aphid acyr-genome.fa > acyr-snap1.gff
$snap/snap -name snap4 -quiet -gff3 aphid acyr-genome.fa > acyr-snap2.gff3 &

# The GFF3 run above is backgrounded; wait for it to finish before the
# in-place edit below touches its output file.
wait

# Post-process the GFF3 in place:
#  - rename feature types: gene -> mRNA, exon -> CDS
#  - blank out every #SNAP-version / ##gff-version header after the first
#    occurrence ($sv / $gf count how many have been seen so far)
# The script is passed as several -e chunks so no quoted string spans lines.
perl -pi \
  -e 's/\tgene/\tmRNA/; s/\texon/\tCDS/;' \
  -e 'if (m/#SNAP-version.*/ and $sv++) { s/#SNAP-version.*//; }' \
  -e 'if (m/##gff-version.*/ and $gf++) { s/##gff-version.*//; }' \
  acyr-snap2.gff3

cp -p acyr-snap2.gff3 $gbn/databases/aphid/

# Bulk-load the genome fasta and all GFF3 files under databases/aphid.
# NOTE(review): the relative paths (lib, bin, databases) presumably assume the
# current directory is $gbn -- confirm before running.
perl -Ilib bin/lucene_bulk_load_gff.pl --java lib/java/ --create \
  --data databases/aphid/genome --fasta databases/aphid/*genome.fa databases/aphid/*.gff3

# mv mygbrowse.conf aphid.conf ; and edit to suit ...
cp aphid.conf conf/gbrowse.conf/