#!/usr/bin/perl eval 'exec /usr/bin/perl -S $0 ${1+"$@"}' if 0; # not running under some shell use strict; # use lib './blib/lib'; # use DBI; use IO::File; use Getopt::Long; use Bio::DB::GFF; use Bio::DB::GFF::Util::Binning 'bin'; #? use Bio::DB::GFF::Adaptor::dbi::mysqlopt; use Bio::DB::GFF::Adaptor::lucene; # use constant MYSQL => 'mysql'; # use constant FDATA => 'fdata'; # use constant FTYPE => 'ftype'; # use constant FGROUP => 'fgroup'; use constant FDNA => 'fdna'; # use constant FATTRIBUTE => 'fattribute'; # use constant FATTRIBUTE_TO_FEATURE => 'fattribute_to_feature'; use constant LUCENE => 'lucene'; use constant LUDATA => 'ludata'; =head1 MODIFIED revision of gff-mysql bulk loader for lucene gff adaptor Basically writes temp files like mysql loader, but formatted for input to lucene indexer. Oct 2005, Don Gilbert, gilbertd@indiana.edu =cut =head1 NAME lu_bulk_load_gff.pl - Bulk-load a Bio::DB::GFF database from GFF files. =head1 SYNOPSIS % bulk_load_gff.pl -d testdb dna1.fa dna2.fa features1.gff features2.gff ... =head1 DESCRIPTION This script loads a Bio::DB::GFF database with the features contained in a list of GFF files and/or FASTA sequence files. You must use the exact variant of GFF described in L. Various command-line options allow you to control which database to load and whether to allow an existing database to be overwritten. This script differs from bp_load_gff.pl in that it is hard-coded to use MySQL and cannot perform incremental loads. See L for an incremental loader that works with all databases supported by Bio::DB::GFF, and L for a MySQL loader that supports fast incremental loads. =head2 NOTES If the filename is given as "-" then the input is taken from standard input. Compressed files (.gz, .Z, .bz2) are automatically uncompressed. FASTA format files are distinguished from GFF files by their filename extensions. Files ending in .fa, .fasta, .fast, .seq, .dna and their uppercase variants are treated as FASTA files. Everything else is treated as a GFF file. If you wish to load -fasta files from STDIN, then use the -f command-line swith with an argument of '-', as in gunzip my_data.fa.gz | bp_fast_load_gff.pl -d test -f - The nature of the bulk load requires that the database be on the local machine and that the indicated user have the "file" privilege to load the tables and have enough room in /usr/tmp (or whatever is specified by the \$TMPDIR environment variable), to hold the tables transiently. Local data may now be uploaded to a remote server via the --local option with the database host specified in the dsn, e.g. dbi:mysql:test:db_host The adaptor used is dbi::mysqlopt. There is currently no way to change this. About maxfeature: the default value is 100,000,000 bases. If you have features that are close to or greater that 100Mb in length, then the value of maxbin should be increased to 1,000,000,000. Note that Windows users must use the --create option. If the list of GFF or fasta files exceeds the kernel limit for the maximum number of command-line arguments, use the --long_list /path/to/files option. =head1 COMMAND-LINE OPTIONS Command-line options can be abbreviated to single-letter options. e.g. -d instead of --database. --database Lucene database path-name (default ... ) --create Reinitialize/create data tables without asking --append Add to data tables without asking --fasta File or directory containing fasta files to load --javalib Directory containing lucene.jar --long_list Directory containing a very large number of GFF and/or FASTA files ## --user Username to log in as ## --password Password to use for authentication ## --local Flag to indicate that the data source is local --maxfeature Set the value of the maximum feature size --group A list of one or more tag names (comma or space separated) to be used for grouping in the 9th column. =head1 SEE ALSO L, L, L =head1 AUTHOR Lincoln Stein, lstein@cshl.org Copyright (c) 2002 Cold Spring Harbor Laboratory This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. See DISCLAIMER.txt for disclaimers of warranty. =cut package Bio::DB::GFF::Adaptor::faux; # use Bio::DB::GFF::Adaptor::dbi::mysqlopt; use Bio::DB::GFF::Adaptor::lucene; use vars '@ISA'; #? @ISA = 'Bio::DB::GFF::Adaptor::dbi::mysqlopt'; @ISA = 'Bio::DB::GFF::Adaptor::lucene'; sub insert_sequence { my $self = shift; my ($id,$offset,$seq) = @_; print join("\t",$id,$offset,$seq),"\n"; } package main; # NEED this patch to bp_bulk_load_gff for GFF.pm -- dgg; # fly ID=mrnaId:exon1 is trashed by bioperl version ## this is bad?? why : returns 'ARRAY' not (name,class) sub patch_GFF3_wormnamemunging { local $^W = 0; if (1) { # $adaptor eq 'Bio::DB::GFF' eval <<'END'; use Bio::DB::GFF; sub Bio::DB::GFF::_gff3_name_munging { return @_; } ## no; problems seems with parent multiple returning (\@name,\@lclass); END warn $@ if $@; } } patch_GFF3_wormnamemunging(); eval "use Time::HiRes"; undef $@; my $timer = defined &Time::HiRes::time; my $bWINDOWS = 0; # Boolean: is this a MSWindows operating system? if ($^O =~ /MSWin32/i) { $bWINDOWS = 1; } my ($DSN,$FORCE,$USER,$PASSWORD,$FASTA,$LOCAL,$MAX_BIN,$GROUP_TAG,$LONG_LIST); my ($JAVA_LIB,$APPEND); GetOptions ('database:s' => \$DSN, 'create' => \$FORCE, 'append' => \$APPEND, 'javalib=s' => \$JAVA_LIB, 'fasta:s' => \$FASTA, # 'local' => \$LOCAL, 'maxbin|maxfeature:s' => \$MAX_BIN, 'group:s' => \$GROUP_TAG, 'long_list:s' => \$LONG_LIST ) or (system('pod2text', $0), exit -1); $DSN ||= 'lucenetest'; unless(-e "$JAVA_LIB/lucene.jar") { die "Missing $JAVA_LIB/lucene.jar Please locate lucene.jar with -java /path/to/java/lib"; } if ($bWINDOWS && not $FORCE) { die "Note that Windows users must use the --create option.\n"; } unless ($FORCE||$APPEND) { die "This will delete all existing data in database $DSN. If you want to do this, rerun with the --create or --append option.\n" if $bWINDOWS; open (TTY,"/dev/tty") or die "/dev/tty: $!\n"; #TTY use removed for win compatability print STDERR "This operation will delete all existing data in database $DSN. Continue? "; my $f = ; die "Aborted\n" unless $f =~ /^[yY]/; close TTY; } # my (@auth,$AUTH); # if (defined $USER) { # push @auth,(-user=>$USER); # $AUTH .= " -u$USER"; # } # if (defined $PASSWORD) { # push @auth,(-pass=>$PASSWORD); # $AUTH .= " -p$PASSWORD"; # } $DSN=~s/database=//i; $DSN=~s/;host=/:/i; #cater for dsn in the form of "dbi:mysql:database=$dbname;host=$host" # my($DBI,$DBD,$DBNAME,$HOST)=split /:/,$DSN; # if (defined $HOST) { # ## dgg patch # if ($HOST =~ s/;port=(\d+)//) { $AUTH .= " -P$1"; } # if ($HOST =~ s/;mysql_socket=([^;\s]+)//) { $AUTH .= " -S$1"; } # $AUTH .= " -h'$HOST'"; # dgg quoted HOST for commandline ; is dsn with semicolons; # } # $DBNAME=$DSN unless $DSN=~/:/; # if (defined $DBNAME) { # $AUTH .= " -D$DBNAME"; # } # if (defined $LOCAL) { # $LOCAL='local'; # $AUTH.=' --local-infile=1'; # }else { # $LOCAL=''; # } my $db = Bio::DB::GFF->new(-adaptor=>'faux',-dsn => $DSN, -write => 1, -create => $FORCE) or die "Can't open database: ",Bio::DB::GFF->error,"\n"; my $erase= ($FORCE || !$APPEND)?1:0; $MAX_BIN ? $db->initialize(-erase=>$erase,-MAX_BIN=>$MAX_BIN) : $db->initialize(-erase=>$erase); # $MAX_BIN ||= $db->meta('max_bin') || 100_000_000; $MAX_BIN ||= $db->MAX_BIN; # deal with really long lists of files if ($LONG_LIST) { -d $LONG_LIST or die "The --long_list argument must be a directory\n"; opendir GFFDIR,$LONG_LIST or die "Could not open $LONG_LIST for reading: $!"; @ARGV = map { "$LONG_LIST\/$_" } readdir GFFDIR; closedir GFFDIR; if (defined $FASTA && -d $FASTA) { opendir FASTA,$FASTA or die "Could not open $FASTA for reading: $!"; push @ARGV, map { "$FASTA\/$_" } readdir FASTA; closedir FASTA; } elsif (defined $FASTA && -f $FASTA) { push @ARGV, $FASTA; } } foreach (@ARGV) { $_ = "gunzip -c $_ |" if /\.gz$/; $_ = "uncompress -c $_ |" if /\.Z$/; $_ = "bunzip2 -c $_ |" if /\.bz2$/; } my (@gff,@fasta); foreach (@ARGV) { if (/\.(fa|fasta|dna|seq|fast)/i) { push @fasta,$_; } else { push @gff,$_; } } @ARGV = @gff; push @fasta,$FASTA if defined $FASTA; # drop everything that was there before my %FH; my $tmpdir = $ENV{TMPDIR} || $ENV{TMP} || '/var/tmp'; $tmpdir =~ s!\\!\\\\!g if $bWINDOWS; #eliminates backslash mis-interpretation # my @files = (FDATA,FTYPE,FGROUP,FDNA,FATTRIBUTE,FATTRIBUTE_TO_FEATURE); # foreach (@files) { # $FH{$_} = IO::File->new(">$tmpdir/$_.$$") or die $_,": $!"; # $FH{$_}->autoflush; # } my @files = (LUDATA,FDNA); foreach (@files) { $FH{$_} = IO::File->new(">$tmpdir/$_.$$") or die $_,": $!"; $FH{$_}->autoflush; } my $pid= $$; # dgg, for mult files generated, load separately my $FID = 1; #$pid; my $GID = 1; #$pid; my $FTYPEID = 1; #$pid; # no, use one typeid among files ?? but need same order .. rats my $ATTRIBUTEID = 1;# $pid; # likewise, attribs shared among chr files my %GROUPID = (); my %FTYPEID = (); my %ATTRIBUTEID = (); my %DONE = (); my $FEATURES = 0; my $count; my $fasta_sequence_id; my $gff3; my @OLD_gff_fields= qw( ref start stop bin score strand phase tstart tstop gclass gname method source feature_id group_id ); my @gff_fields= qw( ref start stop source method score strand phase gclass gname tstart tstop feature_id group_id bin ); =item gff_fields @gff_fields MUST MATCH @Bio::DB::GFF::Adaptor::memory::feature_serializer::hash2array_map =cut $db->preferred_groups(split (/[,\s]+/,$GROUP_TAG)) if defined $GROUP_TAG; my $last = Time::HiRes::time() if $timer; my $start = $last; # avoid hanging on standalone --fasta load if (!@ARGV) { $FH{NULL} = IO::File->new(">$tmpdir/null"); push @ARGV, "$tmpdir/null"; } while (<>) { chomp; my ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group); # close sequence filehandle if required if ( /^\#|\s+|^$|^>/ && defined $FH{FASTA}) { $FH{FASTA}->close; delete $FH{FASTA}; } # print to fasta file if the handle is open if ( defined $FH{FASTA} ) { $FH{FASTA}->print("$_\n"); next; } elsif (/^>(\S+)/) { # uh oh, sequence coming $FH{FASTA} = IO::File->new(">$tmpdir/$1\.fa") or die "FASTA: $!\n"; $FH{FASTA}->print("$_\n"); print STDERR "Preparing embedded sequence $1\n"; push @fasta, "$tmpdir/$1\.fa"; next; } elsif (/^\#\#\s*gff-version\s+3/) { $gff3++; next; } elsif (/^\#\#\s*group-tags\s+(.+)/) { $db->preferred_groups(split(/\s+/,$1)); next; } elsif (/^\#\#\s*sequence-region\s+(\S+)\s+(\d+)\s+(\d+)/i) { # header line ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group) = ($1,'reference','Component',$2,$3,'.','.','.',$gff3 ? "ID=Sequence:$1": qq(Sequence "$1")); } elsif (/^\#/) { next; } else { ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group) = split "\t"; } next unless defined $ref; $FEATURES++; my $size = $stop-$start+1; warn "Feature $group ($size) is larger than $MAX_BIN. You will have trouble retrieving this feature.\nRerun script with --maxfeature set to a higher value.\n" if $size > $MAX_BIN; $score = undef if $score eq '.'; $strand = undef if $strand eq '.'; $phase = undef if $phase eq '.'; my ($group_class,$group_name,$target_start,$target_stop,$attributes) = $db->split_group($group,$gff3); $group_class ||= undef; # '\N'; $group_name ||= undef; $target_start ||= undef; $target_stop ||= undef; $method ||= undef; $source ||= undef; unless (ref($group_name) =~ /ARRAY/) { #dgg $group_name= [$group_name]; $group_class= [$group_class]; } ## add loop over groups; dgg my $fid; my $lucindex= $FH{ LUDATA() }; foreach (1..scalar(@$group_name)) { my $gname = shift @$group_name; my $gclass= shift @$group_class ; $fid = $FID++; my $gid = $GROUPID{$gclass,$gname} ||= $GID++; my $ftypeid = $FTYPEID{$source,$method} ||= $FTYPEID++; my $bin = bin($start,$stop,$db->MIN_BIN); $bin = $db->normalizeNumber($bin); #? need this my $type = join(':',$method,$source); # $FH{ FDATA() }->print( join("\t",$fid,$ref,$start,$stop,$bin,$ftypeid,$score,$strand,$phase,$gid,$target_start,$target_stop),"\n" ); # $FH{ FGROUP() }->print( join("\t",$gid,$gclass,$gname),"\n" ) unless $DONE{"G$gid"}++; # $FH{ FTYPE() }->print( join("\t",$ftypeid,$method,$source),"\n" ) unless $DONE{"T$ftypeid"}++; print $lucindex "i type:$type\n"; # so isnt split on :, or use "" quote search? my %gff_vals=(); # !! @gff_fields MUST MATCH # @Bio::DB::GFF::Adaptor::memory::feature_serializer::hash2array_map @gff_vals{@gff_fields}= ( $ref, $start, $stop, $source, $method, $score, $strand, $phase, $gclass, $gname, $target_start, $target_stop, $fid, $gid, $bin ); foreach my $f (@gff_fields) { my $v= $gff_vals{$f}; print $lucindex "i $f:$v\n" if(defined $v); } print $lucindex join("\n", map { "i ".lc(join(':',$_->[0],$_->[1])) } @$attributes), "\n" if (ref $attributes); ## lucene::feature2stringT($featurehash) my @gff_vals= @gff_vals{@gff_fields}; push @gff_vals,map {join "=",@$_} @$attributes if $attributes; my $fs= join "\t",@gff_vals; print $lucindex "t contents:",$fs,"\n"; print $lucindex "add\n"; } # foreach (@$attributes) { # my ($key,$value) = @$_; # my $attributeid = $ATTRIBUTEID{$key} ||= $ATTRIBUTEID++; # $FH{ FATTRIBUTE() }->print( join("\t",$attributeid,$key),"\n" ) unless $DONE{"A$attributeid"}++; # $FH{ FATTRIBUTE_TO_FEATURE() }->print( join("\t",$fid,$attributeid,$value),"\n"); # } if ( $fid % 1000 == 0) { my $now = Time::HiRes::time() if $timer; my $elapsed = $timer ? sprintf(" in %5.2fs",$now - $last) : ''; $last = $now; print STDERR "$fid features parsed$elapsed..."; print STDERR -t STDOUT && !$ENV{EMACS} ? "\r" : "\n"; } } my $didfasta=0; for my $file (@fasta) { warn "Preparing DNA file $file....\n"; my $old = select($FH{FDNA()}); $db->load_fasta($file); $didfasta++; warn "done...\n"; select $old; } $_->close foreach values %FH; printf STDERR "Total parse time %5.2fs\n",(Time::HiRes::time() - $start) if $timer; warn "Loading feature data. You may see duplicate key warnings here...\n"; $start = time(); my $success = 1; my $TERMINATEDBY = $bWINDOWS ? q( LINES TERMINATED BY '\r\n') : ''; ## $db->close(); ## close/clear lucene indexwrite lock from fasta; $db->_close_databases(); # what the heck is perl .close equiv undef $db; foreach (LUDATA()) { my $ludata= "$tmpdir/$_.$$"; my $create = ($didfasta) ? "" : ($FORCE) ? "-create" : ""; my $command =<