#!/usr/bin/perl # alttrlist.perl # use: perl alttrlist.perl < exons.fasta > exons-altr.ids # recheck tandy exons for overlapping alternate transcripts; write ID list # doesnt seem to be a problem; tandemgenes.pl is doing its filter ok use strict; use warnings; my $igpartialexon=1; my $groupok=1; our $BINSIZE = 1000 ; #was# 5000; #open(GREP, "grep '^>' $queryfa|") or return $altids; # die "grep $queryfa"; my ($atref, $atstrand, %didtypeloc, %dididtr, %alttr, %isalttr); while(<>) { my($xid)=m/>(\S+)/ or next; my($aid); if(m/altids=(\S+)/){$aid=$1;} my($ref,$strand,$loc)=(0,0,"",""); if(m/loc=([^;\s]+)/) { ($ref,$loc,$strand)= split(/:/,$1); ($atref, $atstrand)= ($ref,$strand); } my($gid,$xn,$xb,$xe) = $xid =~ m/^(\w+)\.(\d+):(\d+)-(\d+)/; unless($gid && $xe) { ($gid,$xb,$xe) = $xid =~ m/^(\w+):(\d+)-(\d+)/; $xn=1; } warn "bad xid: $gid.$xn:$xb-$xe = $xid\n" unless($gid && $xe); my $isdup= $alttr{$gid} || filter_genegroup_overlaps( $ref, $gid, $xn, $xb, $xe); if($aid) { $aid =~ s/pm,.*$// if ($igpartialexon); # no partials here ?? my @altids= grep{ (m/^\w+\.\d+:\d+\-\d+/) } split(",", $aid); #? exclude $xid here or not # $newaltids->{$xid}= \@altids; foreach my $ad (@altids) { my($agid,$axn,$axb,$axe) = $ad =~ m/^(\w+)\.(\d+):(\d+)-(\d+)/; unless($agid && $axe) { ($agid,$axb,$axe) = $xid =~ m/^(\w+):(\d+)-(\d+)/; $axn=1; } my $isdup= $alttr{$agid} || filter_genegroup_overlaps( $ref, $agid, $axn, $axb, $axe); } } } foreach my $ftype (sort keys %isalttr) { my @altr= sort keys %{$isalttr{$ftype}}; my $na= @altr; print "# group $ftype; n.alt_transcripts=$na\n"; map{ print "$_\n"; }@altr; } #................ sub _isoverlap { my($gb,$ge, $qb,$qe)= @_; return ($gb <= $qe && $ge >= $qb) ? 1 : 0; } sub filter_genegroup_overlaps { my($ref, $gid,$xn,$ab,$ae)= @_; # global: %didtypeloc; reset per scaffold # return 0 unless($FILTER_GENEGROUP_OVERLAPS); return 0 if($dididtr{$gid.$xn}++); # dont count same exon twice $ref ||="thisscaffold"; my $loc= "$ab-$ae"; my($ftype)= ($groupok) ? $gid =~ m/^(\D+)/ : ("all"); my $isolap=0; my @bins= (int($ab/$BINSIZE) .. int($ae/$BINSIZE)); LBIN: foreach my $ib (@bins) { $didtypeloc{$ftype.$ref}{$ib} or next; my @locs= @{$didtypeloc{$ftype.$ref}{$ib}}; foreach my $lc (@locs) { my ($lb,$le)= split "-",$lc; if(_isoverlap($ab,$ae,$lb,$le)) { $isolap=1; last LBIN; } } } if ($isolap) { $isalttr{$ftype}{$gid}++; $alttr{$gid}++; return 1; } foreach my $ib (@bins) { push @{$didtypeloc{$ftype.$ref}{$ib}}, $loc; } return 0; }