#!/usr/bin/env perl
# genomepathmap.pl
=head1 ABOUT
genomepathmap.pl
Apache mod_rewrite map handler for common genome data path.
NOTE: RewriteMap is started/attached by mod_rewrite at apache-start-time
gets NO browser/caller info (unless we send with calling params)
Added to GMOD Bulkfiles toolset.
D. Gilbert, mar 2005
# --- /genome/README.html ------
Common genome URLS for Model Organism Databases will support automated retrievals:
/genome/dna OR /genome/genome = full genome dna in FastA format of primary organism
(D. melanogaster).
/genome/protein OR /genome/proteome = full genome protein set "
/genome/transcript OR /genome/transcriptome = full genome RNA transcript set "
/genome/features OR /genome/gff = full genome RNA feature set (GFF format)
/genome/{species-name}/{dna, protein and trancript} = full set for that species,
for the above listed species.
/genome/species = list available species
/genome/help = information page (i.e., this page, equivalent to /genome/ ).
D. melanogaster heterochromatin currently is handled as
a separate section: /genome/heterochromatin/{dna,protein,transcript}
OR /genome/Drosophila_melanogaster/heterochromatin/{...}
Support for alternate versions and formats may be added later.
# --- Apache httpd .conf ------
RewriteEngine on
## support for GMOD common /genome/ data urls, mar05
## NOTE: /genomes/ is now symlink to same data/genomes/ folder
## this does something, not right though
RewriteRule ^/genome$ /genome/
RewriteRule ^/genome/(.*) /data/genomes/$1 [PT]
RewriteMap genomedata-map prg:/bio/argos/flybase/cgi-bin/genomepathmap.pl
# -- these settings for local .htaccess (path relative)
# -- change to
RewriteRule help$ "" [L]
RewriteRule (species)$ /cgi-bin/genomepathmap.pl?%{REQUEST_URI} [L]
##RewriteCond %{REQUEST_URI} (dna|features|protein|transcript)$
RewriteCond %{REQUEST_URI} (dna|chromosome|genome|features|gff|protein|proteome|transcript|transcriptome)$
RewriteCond %{HTTP:Accept-encoding} gzip
RewriteCond ${genomedata-map:%{REQUEST_URI}/gzip} (.+)
RewriteRule . %1 [L]
# this works ok; will handle gzip-enabled and non-gz browsers
RewriteRule (dna|chromosome|genome|features|gff|protein|proteome|transcript|transcriptome)$ \
/cgi-bin/genomepathmap.pl?%{REQUEST_URI}
## this was problem for .cgi calls !
LayoutDefaultHandlers Off
LayoutHandler text/html
LayoutMerge Off
=cut
use strict;
use POSIX;
use constant DEBUG => 0;
#my $webroot="/data/genomes";
my $webroot="/genome";
my $realroot="$ENV{ARGOS_ROOT}/MY_SPECIES/web$webroot";
if ($ENV{DOCUMENT_ROOT}) { $realroot="$ENV{DOCUMENT_ROOT}$webroot"; }
my $genus= $ENV{GENUS} || "MY_GENUS"; #? could be regex pattern; check $ENV ? config files?
my $defaultspp= $ENV{SPECIES} || "MY_SPECIES";
my $LOGFILE="$realroot/url2file.log"; # debug only
## problem here -- get releases from file path
my %orgrel= (
# dmel => "r4.1",
# dpse => "r1.03",
);
my $ok_gzip= 0; ## assume apache map caller knows
$| = 1; # unbuffer for handleRewriteMap
if ($ENV{GATEWAY_INTERFACE} =~ /CGI/) {
handleCgi();
}
else {
## this call is for use from apache
## RewriteMap genomedata-map prg:/bio/argos/flybase/cgi-bin/genomepathmap.pl
handleRewriteMap();
}
sub getspecies {
my @spp=();
if (opendir(D,$realroot)) { @spp= grep(/[A-Z]\w*_\w+/,readdir(D)); closedir(D); }
return sort @spp;
}
sub handleCgi {
## add optional params: uri=%{REQUEST_URI} ; config= ; type_map= ? ;
my $pathinfo= shift @ARGV;
if ($pathinfo =~ /help$/) { notfound($pathinfo); } # shouldn't be here
elsif ($pathinfo =~/species$/) { # list
print STDOUT "Content-Type: text/plain\n\n";
print STDOUT "# $pathinfo \n";
my @spp= getspecies();
print STDOUT join("\n",@spp),"\n";
exit;
}
# elsif ($pathinfo =~/release$/) { # list; check for species in pathinfo
# print STDOUT "Content-Type: text/plain\n\n";
# print STDOUT "# $pathinfo \n";
# my @rels=();
# my @spp= getspecies();
# foreach my $spp (@spp) {
# if (opendir(D,"$realroot/$spp")) {
# push @rels, map { "$spp/$_"; } grep(/^\w/,readdir(D));
# closedir(D);
# }
# }
# print STDOUT join("\n",@rels),"\n";
# exit;
# }
$ok_gzip= ($ENV{HTTP_ACCEPT_ENCODING} =~ /gzip/);
my ($path,$file)= getdatapath($pathinfo); # returns relpath/file w/o .gz
my $pathtrans="$realroot/$path";
if ($ok_gzip && -f "$pathtrans.gz") {
$pathtrans= "$pathtrans.gz";
}
elsif (-f $pathtrans) {
#?? make gzip if $ok_gzip
}
elsif (-f "$pathtrans.gz") {
$pathtrans= "$pathtrans.gz";
}
else { notfound($file); } # exits
my $doget= ($ENV{'REQUEST_METHOD'} =~ m/GET|POST/);
my $flen = -s $pathtrans;
my $fdate= -M $pathtrans;
my @tm= gmtime( $^T - 24*60*60*$fdate );
$fdate= POSIX::strftime( "%a, %d %b %Y %T GMT", @tm); ## apache is picky about format here
## see HTTP::Date: sub time2str (;$)
## Last-Modified: Tue, 15 Feb 2005 02:10:15 GMT
my $contype="text/plain";
if ($file =~ /fasta/) { $contype="application/x-fasta"; }
elsif($file =~ /gff/) { $contype="application/x-gff3"; }
print STDERR "genomepathmap.cgi $pathinfo => $fdate, $path\n" if DEBUG;
if ($ok_gzip && $pathtrans =~ /.gz$/) {
print STDOUT "Content-Disposition: attachment; filename=$file.gz\n";
print STDOUT "Last-Modified: $fdate\n";
print STDOUT "Content-Length: $flen\n";
print STDOUT "Content-Encoding: gzip\n";
print STDOUT "Content-Type: $contype\n\n";
##print STDOUT "Content-Type: text/plain\n\n";
# ^^ wormbase spec says: MIME type is application/x-fasta or application/x-gff3
if ($doget && open(F, $pathtrans)) {
my ($n,$buf,$nt);
binmode(F);
binmode(STDOUT);
## apache-melon is trashing gzip output above; closes after 8 bytes received; read is ok
# while( ($n= read(F,$buf,16384))>0 ) { print STDOUT $buf; } # $nt += $n;
while( ($n= sysread(F,$buf,16384))>0 ) { syswrite STDOUT,$buf,$n; $nt += $n; }
close(F);
print STDERR "url2gnofile $pathinfo ; nt=$nt\n";
}
}
else {
print STDOUT "Content-Disposition: attachment; filename=$file\n";
print STDOUT "Last-Modified: $fdate\n";
if ($pathtrans =~ /.gz$/) {
print STDOUT "Content-Type: $contype\n\n";
if ($doget && open(F,"gunzip -c $pathtrans|")){ print STDOUT join("",); close(F); }
}
else {
print STDOUT "Content-Length: $flen\n";
print STDOUT "Content-Type: $contype\n\n";
if ($doget && open(F,"$pathtrans")){ print STDOUT join("",); close(F); }
}
}
}
sub handleRewriteMap {
if (DEBUG && $LOGFILE){
open(LOG,">>$LOGFILE");
print LOG "getfile ENV:\n",join("\n ",map{ "$_ => $ENV{$_}" } sort keys %ENV),"\n";
close(LOG);
}
$| = 1; # unbuffer for handleRewriteMap
while()
{
if ($_ =~ s,/gzip$,,) { $ok_gzip= 1; } else { $ok_gzip= 0; } #? tack onto call path
my $path= getdatapath($_); # returns relpath/file w/o .gz
my $pathtrans="NULL";
if ($ok_gzip && -f "$realroot/$path.gz") {
$pathtrans="$webroot/$path.gz";
}
elsif (-f "$realroot/$path") {
$pathtrans= "$webroot/$path";
}
print STDOUT "$pathtrans\n";
if (DEBUG && $LOGFILE){ open(LOG,">>$LOGFILE"); print LOG "genomepath $_ => $path,$pathtrans\n"; close(LOG); }
}
}
sub notfound {
my($file)=@_;
print STDOUT qq{Status: 404 Not Found
Content-Type: text/html
404 Not Found
Not Found
The requested URL $file was not found on this server.
};
exit(0);
}
# sub getrelease {
# my( $uri, $org, $path, $filepatt, $suff)= @_;
# my $rel;
#
# ##hack##if ($uri =~ /hetero/) { $org= $org.'het'; } ## $org eq "dmel" &&
# ##if ($org eq "dmel" && $uri =~ /hetero/) { $rel= $orgrel{$org.'het'}; }
#
# $rel= $orgrel{$org};
# if ($rel && $rel !~ /^-/) { $rel="-$rel"; }
#
# #check path for $org-release(.*).txt ??
# unless($rel)
# {
# if(opendir(D,"$realroot/$path")) {
# my ($relf,@rels)= grep(/^$filepatt/, readdir(D));
# closedir(D);
# $relf =~ s/^$filepatt//;
# $relf =~ s/$suff.*$//;
# $rel= $relf;
# $orgrel{$org}= $rel;
# }
# }
# return $rel;
# }
sub getreleasefile {
my( $uri, $path, $filepatt, $suff)= @_;
my($file)=('');
if(opendir(D,"$realroot/$path")) {
my @files= grep(/$filepatt/, readdir(D));
closedir(D);
## need .gz check; more, do what?
($file,my @more) = grep(/$suff(.gz)?$/,@files);
$file=~s/\.gz$//; # handler checks for .gz
}
return $file;
}
## FIXME ....
sub getdatapath {
my($uri)= @_;
my $format="fasta";
my $spp= $defaultspp;
my $org=""; ##"dmel";
my $part="-all";
my $folder="current";
if ($uri =~ m,/(${genus}_\w+),) { $spp= $1; }
elsif ($uri =~ m,/([A-Z]\w+_\w+),) { $spp= $1; }
if ($spp =~ m/^(\w)[^_]*_(\w{1,3})/) {
$org= lc("$1$2"); # Gspp 4 letter abbrev.
}
if ($uri =~ m,/$spp/(\w[^/]+)/.+,) { $folder= $1; }
my $type="NULL"; ## need to fail if no primary verb: dna,protein,transcript or alias
if ($uri =~ /(gff|feature)$/i) { $format="gff"; $type=""; }
elsif($uri =~ /(genome|dna|chromosome)$/i) { $type="chromosome"; }
elsif($uri =~ /(protein|proteome|translation)$/i) { $type="translation"; }
elsif($uri =~ /(mrna|transcript|transcriptome)$/i) { $type="transcript"; }
elsif($uri =~ /(ncrna|miscrna)$/i) { $type="miscRNA"; } ## change ot ncRNA
## ^^ need to know what fasta labels are being used: ncRNA? miscRNA? tRNA ?
## else error ?
# hack needs some config file... check in $spp/ path ?
if ($spp =~ /Saccharomyces/) {
$type =~ s/transcript/gene/;
$type =~ s/miscRNA/ncRNA/;
## no protein data here ??
}
## fixme - also check $uri for real version-folder
## e.g. if ($uri =~ /(version|release)([\w..])/)
my $path = "$spp/$folder/$format";
my $filepatt = $part;
$filepatt .= "-$type" if $type;
## need .gz, no.gz option, check for both
my $file= getreleasefile( $uri, $path, $filepatt,".$format");
$path = "$path/$file";
return (wantarray) ? ($path,$file) : $path;
}