#!/usr/bin/perl # aug2mapgff.pl use strict; use Getopt::Long; # qw(:config no_ignore_case bundling); my $exp= $ENV{n} || "0"; my $idprefix= $ENV{idprefix} || ''; ## || "AUG${exp}_"; my $noconstraint= defined $ENV{con} ? $ENV{con} : 0; my $nofasta= defined $ENV{fa} ? $ENV{fa} : 0; my $aa_attribute=0; my $uniqueid= 1; #? my $fastaonly=0; my $optok= &GetOptions ( 'source|exp=s' => \$exp, 'idprefix=s' => \$idprefix, 'noconstraint' => \$noconstraint, #dgg 'nofasta' => \$nofasta, 'fastaonly' => \$fastaonly, 'aa_attribute!' => \$aa_attribute, 'uniqueid!' => \$uniqueid, ); die "usage: aug2mapgff < augustus.gff > map.gff options: --exp|source=expname --idprefix=AUGexp --nouniqueid --aa_attribute --nofasta --fastaonly --noconstraint protein aa will be added at end as ##FASTA unless -aa_att or -nofasta; -fastaonly outputs protein.fasta example: -exp=1 -aa -nocon SCAFFOLD33 AUGUSTUS.1 mRNA 33647 34810 0.68 + . ID=AUG1_g2.t1;evd_pG=LOC10 0115055,FGPP_ORF_2_from_SCAFFOLD33;evd_pP=Tcas:GLEAN_15479;protein=MSCNLCKCSSDGNYAACTFMQCFDFNFEEEQRSKRSTNE VVAKLSSDIPRISGYTQGDQCPSKSFYNDCNMCVCGPDDASAACTMMMCMPGETQQPSKIVPAKLNDIARIGPGMPRRRVLPR SCAFFOLD33 AUGUSTUS.1 exon 33647 33656 . + . Parent=AUG1_g2.t1 SCAFFOLD33 AUGUSTUS.1 exon 33734 33769 . + . Parent=AUG1_g2.t1 SCAFFOLD33 AUGUSTUS.1 exon 33865 34005 . + . Parent=AUG1_g2.t1 SCAFFOLD33 AUGUSTUS.1 CDS 33912 34005 0.72 + 0 Parent=AUG1_g2.t1 SCAFFOLD33 AUGUSTUS.1 CDS 34116 34175 1 + 2 Parent=AUG1_g2.t1 \n" unless($optok); my (@gff,@attr,%hints,%prots,%seengene); my ($aa,$psup,$supexons,$supintrons,$suputr5,$suputr3,$hintfull,$hintpart); my ($ingene, $incon, $trid, $oldid); $nofasta=1 if($aa_attribute); $idprefix= "AUG${exp}_" unless($idprefix); if($fastaonly) { $nofasta=0; $aa_attribute=0; $noconstraint=1; } sub put_transcript { if($incon) { # not transcript @gff=() if($noconstraint); } else { push(@attr,"pct_support=$psup") if(defined $psup); foreach my $k (sort keys %hints) { my $v= $hints{$k}; push(@attr, "evd_${k}=$v"); } if($aa) { $prots{$trid}= $aa ; $aa=""; } if($aa_attribute) { my $aa= $prots{$trid}; push(@attr,"protein=".$aa) if($aa); } if(@attr) { my $attr= join(";",@attr); foreach (@gff) { s/$/;$attr/ if(/\tmRNA/); } } } @gff=() if $fastaonly; print @gff if(@gff); @gff=(); @attr=(); %hints=(); ($aa,$psup,$supexons,$supintrons,$suputr5,$suputr3,$hintfull,$hintpart)= (0) x 20; } sub put_fasta { print "#\n" unless($fastaonly); print "##FASTA\n" unless($fastaonly); foreach my $id (sort keys %prots) { my $fa= $prots{$id}; $fa =~ s/(.{1,60})/$1\n/g; print ">$id\n$fa\n"; } } # G: 19 (GLEAN_18911,GH_DGIL_SNO_28281162,GH_NCBI_GNO_32001285,TRdgri_22378,GH_BREN_NSC_50051886,...) # P: 3 (CG31663_G1,CG31663_G2,CG7295_G1) # hint groups fully obeyed: 7 # T: 7 (tar16108,tar16109,tar16110,tar16111,tar16112,tar16113,tar16114) # incompatible hint groups: 1 # T: 1 (tar16115) sub hintids { my($h)= @_; my($t, $n)= $h =~ m/^#\s+(\w+):\s+(\d+)/; my($v)= $h =~ m/\s+\(([^\)]+)\)/; $v =~ s/,\.\.\.//; #my @v=split ",",$v; return ($t,$n,$v); } while(<>){ if(/^#/) { if(/### gene (\w+)/) { $ingene=$1; $incon=0; } elsif(/# Constraints/) { $incon=1; } elsif(/### end gene/) { put_transcript() if(@gff); $ingene=0; } elsif(/##gff-version/){ print unless($fastaonly); } elsif(/# This output was generated with/){ print unless($fastaonly); } elsif(/# Evidence for and against this transcript/){ # push(@attr,"protein=".$aa) if($aa); $aa=""; $prots{$trid}= $aa if($aa); $aa=""; } elsif(/transcript supported by hints .any source.: (\S+)/){$psup=$1;} elsif(/CDS exons: (\S+)/){ $supexons=$1; } elsif(/CDS introns: (\S+)/){ $supintrons=$1; } elsif(/5'UTR exons and introns: (\S+)/){ $suputr5=$1; } elsif(/3'UTR exons and introns: (\S+)/){ $suputr3=$1; } ### need to collect type counts for each sup type? elsif(/hint groups fully obeyed: (\S+)/){ $hintfull=$1; $hintpart=0; } elsif(/incompatible hint groups: (\S+)/){ $hintpart=$1; $hintfull=0; } elsif(($hintfull or $hintpart) and m/^#\s+(\w+):\s+(\d+)/) { my($t,$n,$v)= hintids($_); my $c= $hintfull ? "f" : "p"; $hints{$c.$t}=$v; } elsif(/# protein sequence = .(\w+)/) { $aa=$1; } elsif($aa and /# ([A-Z]+)/) { $aa.=$1; } # last patt? } elsif(/^\w/){ my $p= 1; ## trap error messages: Error in addGene if($incon) { s/grp=/ID=/; s/\tep/\texon_region/; # should use SO terms : region s/\tip/\tintron_region/; s/\tirpart/\tintergenic_region/; s/ "[^"]+"//; $p=0 if($noconstraint); } elsif($ingene) { if(m/\tCDS/){ s/ID=[^;\s]+;?//; } s/\tAUGUSTUS/\tAUGUSTUS.$exp/; ##next if(/\ttranscription_start_site/); my $prefix= $idprefix; if($uniqueid) { # check for/add ref/scaffold/chr num prefix? my($ref)= m/^(\w+)/; (my $sref=$ref) =~ s/^(\w)\D+/$1/; # Scaffold123 -> S123 my($id)= m/(?:Parent|ID)=([^;\s]+)/; unless($id =~ m/$ref/ or $id =~ m/$sref/ or $prefix =~ m/$sref/) { #$ref =~ s/^(\w)(\D+)/$1/; $prefix= $idprefix.$sref; } } s/(ID|Parent)=/$1=$prefix/g if($prefix); if(s/\ttranscript\t/\tmRNA\t/){ put_transcript() if(@gff); s/;Parent=[^;\s]+//; ($trid)= m/ID=([^;\s]+)/; $oldid= $trid; if($uniqueid && $seengene{$trid}++) { my $did= "d" . $seengene{$oldid}; unless($trid =~ s/$prefix/$prefix$did/) { $trid = $did.$trid; } s/=$oldid\b/=$trid/g; } } else { if($oldid && $trid ne $oldid) { s/=$oldid\b/=$trid/g; } } $p= m/\t(gene|intron|start_codon|stop_codon|transcription_start_site|transcription_end_site)/ ? 0:1; } $p=0 if(/^Error/); push(@gff,$_) if $p; } } put_transcript() if(@gff); put_fasta() unless($nofasta); __END__ ## add in hints used comments for evidence Dbxref attribs # Evidence for and against this transcript: # % of transcript supported by hints (any source): 87.1 # CDS exons: 14/14 # G: 14 # P: 6 # CDS introns: 13/14 # G: 13 # 5'UTR exons and introns: 0/2 # 3'UTR exons and introns: 0/1 # hint groups fully obeyed: 2 # G: 1 (FGPP_ORF_1_from_SCAFFOLD15) # P: 1 (Amel:GB12599-PA) # incompatible hint groups: 4 # G: 1 (276244) # P: 3 (Amel:GB11078-PA,Human:NP_478144,Tcas:GLEAN_02629) ### gene g316 scaffold_4 AUGUSTUS gene 1420609 1422424 0.21 + . ID=g316 scaffold_4 AUGUSTUS transcript 1420609 1422424 0.21 + . ID=g316.t1;Parent=g316 scaffold_4 AUGUSTUS exon 1420609 1422424 . + . Parent=g316.t1 scaffold_4 AUGUSTUS start_codon 1421469 1421471 . + 0 Parent=g316.t1 scaffold_4 AUGUSTUS CDS 1421469 1421723 0.71 + 0 ID=g316.t1.cds;Parent=g316.t1 # protein sequence = [MKELMKQLPVVKPLSTKEPAEKIALFQCRLLLPKSSPLQGEVVGDPMLTKRLAKRAAALRACERLHQLKELDDLHLLP # VSHRKR] # Evidence for and against this transcript: # % of transcript supported by hints (any source): 100 # CDS exons: 1/1 # T: 1 # CDS introns: 0/0 # 5'UTR exons and introns: 1/1 # T: 1 # 3'UTR exons and introns: 1/1 # T: 1 # hint groups fully obeyed: 5 # T: 5 (tar15253,tar15254,tar15255,tar15256,tar15257) # incompatible hint groups: 0 ### end gene g316