#!/usr/bin/perl # ncbi_mapview2gff.pl # see http://promotion.molgen.mpg.de/gbrowse/contrib/import_ncbi_mv_hs.pl use strict; use warnings; use Getopt::Long; my $gffsource= "ncbigenomes"; my @ncbifields = # match your data qw( taxid chr start stop strand contig cstart cstop cstrand fname ftype group_label besthit_acc besthit_gi evd_code has5p has3p ); my %NCBI2SO_TYPE = ( GENE => 'mRNA', CDS => 'CDS', UTR => 'UTR', # five_prime_UTR or three_prime_UTR ... fixme PSEUDO => 'pseudogene', # this is flag or mRNA ? duplicates mRNA, evd=pseudo ? ); my $optok = GetOptions( "source=s"=>\$gffsource, ); die << "USAGE" unless($optok); ncbi_mapview2gff [-source $gffsource] < hmm_models.md > hmm_models.gff USAGE ncbi_mapview2gff(*STDIN); sub ncbi_mapview2gff { my($inh)= @_; print "##gff-version 3\n"; my @save=(); while(<$inh>){ next unless(/^\w/); # is it always ^\d taxid ? chomp; my %row = (); @row{ @ncbifields } = split "\t"; # [@field_positions] my $gfftype= $NCBI2SO_TYPE{ $row{ftype} } || $row{ftype}; my $ischild= ($gfftype =~ /CDS|exon|UTR/); my $id= $row{fname}; my $attr= ($ischild ? "Parent" : "ID") . "=$id"; my $dbx = join ",", map{ $row{$_} if $row{$_}; } qw(besthit_acc besthit_gi); $attr .= ";Dbxref=". $dbx if ($dbx =~ /\w/); $row{evd_code} =~ s/\;+/,/g; $row{evd_code} =~ s/\-//g; $row{evd_code} =~ s/ab initio/ab_initio/g; $attr .= ";evd=$row{evd_code}" if $row{evd_code}; my @gff= ($row{chr}, $gffsource, $gfftype, $row{start}, $row{stop}, '.', $row{strand}, '.', $attr); sub same { my($ga,$gb)=@_; for my $i (0..7) { return 0 if($$ga[$i] ne $$gb[$i]); } return 1; } if( $ischild && same( \@gff, \@save )) { $save[8] =~ s/Parent=([^;]+)/Parent=$1,$id/; } else { print join("\t",@save), "\n" if @save; @save= @gff; } # need to combine CDS, UTR, .. of same location, diff parents, as one row } print join("\t",@save), "\n" if @save; } __END__ microbe% zmore dmel5_ncbi_hmm_models.md.gz ------> dmel5_ncbi_hmm_models.md.gz <------ #tax_id chromosome chr_start chr_stop chr_orient contig ctg_start ctg_st op ctg_orient feature_name feature_type group_label best_hit_acc best_hit_gi evidence_code is_5p_complete is_3p_complete 7227 2L 6774 9489 + NT_033779.4 6774 9489 + hmm4134014 GENE reference AAZ86793.1 GI:73853446 psCDS;; N N