// Biogridlet.java
// should be
// package iubio.grid;
import java.io.*;
import java.util.*;
import java.net.*;
import javax.naming.*;
import javax.naming.directory.*;
/**
Biogridlet - basic biogrid toolkit component
a basic directory access component for bioinformatics grids.
Simple test of a "Gridlet" for bio data directory access. For each compute node
on your test grid, do this:
- Install/test/locate
NCBI BLAST software (yet to do as a gridlet),
set bl=/path/to/blastall
- Download Biogridlet .class and .prop files, edit Biogridlet.prop properties
to taste, especially QUERY selection.
Make sure Java 1.3+ runtime is available
- Find a query biosequence in fasta format to test. A sample query set is
java Biogridlet count=100 ldap://bio-mirror.net:3895/srv=srs out=query \
'query=(lib=genbank)(org=Anopheles gambiae)'
- Use Biogridlet to copy a databank subset to each node and run blast:
- node1:
java Biogridlet start=0 count=1000 | $bl/formatdb -i stdin -p F -o T -n databank1
$bl/blastall -p blastn -d databank1 -i query -m 8 -o databank1.out
- node2:
java Biogridlet start=1000 count=1000 | $bl/formatdb -i stdin -p F -o T -n databank2
$bl/blastall -p blastn -d databank2 -i query -m 8 -o databank2.out
- node3 .. n
- Copy blast results from each node and assemble to full result (yet to do; see NBLAST)
The runtime cost for this grid example, from a few quick tests,
is approximately the time it takes to run on one computer with a
full databank, divided by the number of nodes and subset
databanks you use.
Gridlet defined
From Jan K. Labanowski:
Computational Portals for Chemistry
Gridlets and XMLets
I found a new word gridlet in papers by Rajkumar Buyya and Manzur
Murshed from Monash University:
http://www.csse.monash.edu.au/~rajkumar/. By gridlet they
understand the tiny GridApp that contains all information related
to jobs and job execution management details such as jobs
processing requirements.
Note: this should become part of existing package iubio.grid used by
BioGridRunner
-- maybe subpackage iubio.grid.gridlet ?
Design
- Design for basic biogrid - partway between seti@home and globus grid methods
Data grid components - serve data via simple directory search/retrieval methods
Compute grid components - run applications, fetching data from Data grid components
Runner - client app used by biologist to select data, allocate cpu grid nodes,
run analysis, assemble results
Central/Home (?) - master directory services registry
for registering and identifying data and compute grid components
- Basic Components
Biogridlet - Compute-Grid component handler
Biogridrun - basic controller app for running list of available programs (start w/ BLAST?)
Biodirectory - data & software directory services - for Data-Grid servers
-- ldap, http?
Biogridhome - ? central directory for listing & using data & compute grid components
- Biogridlet:
-- ? split into 3 gridlets for cpu-node: node-manager, copy-url, run-app ?
... need simple execute() method to run programs
... need methods for handling Binary objects - gunzip for data
... possible use of java (.class,.jar), perl objects (script)
... should this become a 'screen-saver' type background app run on each compute node
with messaging to Biogridhome to keep node resources up to date?
... need to add these properties for biodata, and fiddle w/ query formats
(ldap, soap, http-cgi, others)
-- objectclass
-- start [Object]
-- count [nObjects]
-- format/fmt (? include w/ objectclass ?)
- Biodirectory:
-- basic directory node server (data + applications)
-- use javaldap now for simplicity
-- ldap-srs interface for SRS data providers.
Find here.
- Biogridrun:
-- select compute nodes from compute grid list known to Biogridhome
--- need parameters of node: os flavor, cpu, disk, memory available at minimum
-- select application to run (for now from list provided by Biogridhome)
--- will include app binary url (no source compiles?)
-- select data from data grid, using search of data directories known to Biogridhome
-- foreach grid cpu node:
--- package up Biogridlet.class + message w/ app and data parameters
--- send to node & start Biogridlet on node
--- either poll node repeatedly, or have
- Biogridhome:
-- mainly a central directory server, with dynamic registration and updates of
available data, software and compute grid nodes, their resources and allowed
users and application running choices
-- Biogridrun interacts w/ this, gets referrals to other data, app and cpu directories
-- use existing bio name service,
ldap://bio-mirror.net/cn=Bioinformatics Name Service --- change name, or add new one
o=Biogrid ... ou=Bio,o=Grid ??
ldap://bio-mirror.net:3891/o=Bions
ldap://bio-mirror.net:3891/bc=Site,bc=Catalog,o=Bions
ldap://bio-mirror.net:3891/bc=Package,bc=Catalog,o=Bions
... start new subdirectory for this ... bc=
Notes
For now, security & authentication will wait, as will the other globus-type components of the grid.
Design for restricted list of applications that can be run.
@author d.gilbert, nov 2002, gilbertd@bio.indiana.edu
*/
public class Biogridlet
{
// properties keys
// main() stores each command-line key=value pair under "Biogridlet." + the
// upper-cased key, so for example count=100 sets the Biogridlet.COUNT property.
/** properties key: data directory url; must have (search() calls usage() when it is missing) */
public static final String URL = "Biogridlet.URL"; // == Context.PROVIDER_URL ?
/** url component: scheme/protocol, filled in by parseUrl; search() dispatches on it */
public static final String PROTOCOL = "Biogridlet.PROTOCOL";
/** url component: host name, filled in by parseUrl */
public static final String HOST = "Biogridlet.HOST";
/** url component: port number as text, filled in by parseUrl */
public static final String PORT = "Biogridlet.PORT";
/** url component: distinguished name of an ldap url, filled in by parseUrl */
public static final String DN = "Biogridlet.DN";
/** url component: path of a non-ldap url, filled in by parseUrl */
public static final String PATH = "Biogridlet.PATH";
/** url component: file part of a non-ldap url, filled in by parseUrl */
public static final String FILE = "Biogridlet.FILE";
/** url component: reference (anchor) of a non-ldap url, filled in by parseUrl */
public static final String REF = "Biogridlet.REF";
/** properties key: search scope; optional, the original note says sub is the default */
public static final String SCOPE = "Biogridlet.SCOPE";
/** properties key: query for databank, data field, etc. to search, ldap query syntax for now */
public static final String QUERY = "Biogridlet.QUERY";
/** properties key: start object number to retrieve from query result; sent as a (start=...) filter term */
public static final String START = "Biogridlet.START";
/** properties key: number of objects to retrieve; sent as a (count=...) filter term */
public static final String COUNT = "Biogridlet.COUNT";
/**
properties key: objectClass to search/retrieve; sent as an (objectClass=...) filter term.
default * gets query summary, objectClasses should be defined in
http://iubio.bio.indiana.edu/biogrid/directories/schema/bioseq.schema
*/
public static final String OBJECT = "Biogridlet.OBJECT";
/**
properties key: result biosequence format; sent as a (format=...) filter term.
limited choices now: fasta, native (e.g. genbank, embl, swissprot, other biosequence formats)
*/
public static final String FORMAT = "Biogridlet.FORMAT";
/**
properties key: which object fields to return
* = all, others are defined in
http://iubio.bio.indiana.edu/biogrid/directories/schema/bioseq.schema
*/
public static final String ATTRIBUTES = "Biogridlet.ATTRIBUTES";
/** properties key: ldap extension controls, sizelimit=10, timelimit=1000 being useful */
public static final String EXTENSIONS = "Biogridlet.EXTENSIONS";
/** visible title for url, built by parseUrl */
public static final String TITLE = "Biogridlet.TITLE";
/** properties key: output control; listdn=false for no name */
public static final String LISTDN = "Biogridlet.LISTDN";
/** properties key: output control; listkey=false for no field key */
public static final String LISTKEY = "Biogridlet.LISTKEY";
/** properties key: output control; listval=false for no field value */
public static final String LISTVAL = "Biogridlet.LISTVAL";
/** properties key: debug=true prints diagnostic messages to the error stream */
public static final String DEBUG = "Biogridlet.DEBUG";
/** properties key: output file, standard output is default */
public static final String OUTPUT = "Biogridlet.OUTPUT";
/** default properties file; main() tries to load it before reading the command line */
public static final String PROPERTIES= "Biogridlet.prop";
/** directory context opened by ldapsearch */
private DirContext dir;
/** results of the last search: assigned by ldapsearch, read by retrieve */
private NamingEnumeration dirresults;
/** properties this instance is configured with; assigned by setProperties */
private Properties direnv;
//String basedn;
// get from env...
/** upper bound on the number of results that retrieve() prints */
private int nrecs= 999999999; //Integer.MAXINT; //?
/** retrieve() output switches: print the entry name, the field keys, the field values */
private boolean showdn= true, showkey= true, showval= true;
/** when true, diagnostics are printed; set from the DEBUG property by setProperties */
private boolean debug= false;
/** not referenced by any code visible in this part of the file */
private boolean helping;
/** stream for results and usage text; retrieve() may redirect it to the OUTPUT file */
private PrintStream out= System.out;
/** stream for diagnostics */
private PrintStream err= System.err;
/**
Command-line entry point: run one search with Biogridlet.prop properties and/or
command-line options, and print what it finds.
Defaults are loaded from the file named by PROPERTIES (a load failure is ignored),
the command-line arguments are applied on top, then search(url) is called; when it
reports results retrieve() prints them, otherwise "No results" goes to standard error.
Arguments recognised by the visible code:
 h / help          print usage (usage() ends the program)
 key=value         stored as property "Biogridlet." + KEY in upper case
 ldap://.. http:// taken as the url to search
 p / pro...        load another properties file
 u / url           url to search
 o / out...        output file, stored under OUTPUT
For p, u and o the value is taken from the next argument when none is attached.
NOTE(review): the line that starts the argument loop is truncated in this copy of the
file (see the note in the body), so how each argument is split into key and value
cannot be confirmed from here.
@param args command-line arguments
*/
public static void main(String[] args)
{
String url= null, output= null, properties= Biogridlet.PROPERTIES;
Properties env= new Properties();
// best effort: a missing or unreadable default properties file is silently ignored
try { env.load(new FileInputStream(properties)); } catch (Exception ex1) {}
// NOTE(review): this line is damaged - text between "i" and "0) {" is missing,
// including the rest of the loop header and the declarations of arg, key, val, m and e.
// Presumably it looped over args and tested the position of '=' in arg - TODO restore
// from the original source before compiling.
for (int i= 0; i0) {
key= arg.substring(m,e);
val= arg.substring(e+1);
}
else if (m>0) {
key= arg.substring(m,arg.length());
}
if (key==null) continue;
if (key.equalsIgnoreCase("h")||key.equalsIgnoreCase("help"))
new Biogridlet(env).usage();
// every key=value pair becomes a Biogridlet.KEY property, whatever the key is
if (key!=null && val!=null) env.put("Biogridlet."+key.toUpperCase(),val);
if ( key.startsWith("ldap://") )
url= key;
else if ( key.startsWith("http://") )
url= key;
else if (key.equals("p")||key.startsWith("pro")) { //properties
// NOTE(review): args[++i] is not bounds-checked; an option given last without a value
// would index past the end of args (same for the u and o options below)
if (val==null) val= args[++i];
properties= val; //env.load(new FileInputStream(val));
// best effort again: an unreadable properties file is silently ignored
try { env.load(new FileInputStream(properties)); }
catch (Exception ex) {}
}
else if (key.equals("u")||key.equals("url")) { //url
if (val==null) val= args[++i];
url= val;
}
else if (key.equals("o")||key.startsWith("out")) { //output
if (val==null) val= args[++i];
output= val;
env.setProperty(OUTPUT,output);
}
else { //?
//if (key!=null && val!=null) env.put("Biogridlet."+key.toUpperCase(),val);
}
}
Biogridlet bn= new Biogridlet(env);
// search() returns false when nothing was found or the search failed
boolean ok= bn.search(url);
if (ok) bn.retrieve();
else System.err.println("No results");
}
/** Creates a gridlet configured with a new, empty property set. */
public Biogridlet()
{
  this(new Properties());
}

/**
Creates a gridlet configured with the given properties.
@param env properties to use; passed on to setProperties
*/
public Biogridlet(Properties env)
{
  setProperties(env);
}
/**
Prints the command-line help to the output stream, lists the current properties on
the error stream, and then ends the program with exit status 0.
This method does not return.
*/
public void usage() {
  out.println(" java "+getClass().getName()+" [ help | url | key=value | -p | -u | -o ]");
  out.println(" A basic gridlet client for Bio-data directory search and retrieval");
  out.println(" -u url-to-search");
  out.println(" -p properties-file");
  // main() accepts o / out... as the output file option, so the help lists it too
  out.println(" -o output-file");
  out.println(" key=value new property");
  out.println(" Reads default properties from "+PROPERTIES);
  // the property listing goes to the error stream, the help text to the output stream
  direnv.list(err);
  out.println();
  // make sure the help text is written before the JVM stops
  out.flush();
  //helping= true;
  System.exit(0);
}
/**
Installs the properties this gridlet works with and refreshes the debug flag
from the DEBUG property (false when the property is not set).
@param env properties to use; null is replaced by a new, empty Properties
*/
public void setProperties(Properties env) {
  direnv= (env != null) ? env : new Properties();
  String debugFlag= direnv.getProperty(DEBUG, "false");
  debug= boolOf(debugFlag);
}
/**
Runs a directory search against the given url, or against the URL property when
no url is given. The url is split into properties by parseUrl; when no URL
property results, usage() is called (which ends the program).
Only the ldap protocol is searched; every other protocol, http included, yields false.
@param url url to search; null or empty means use the URL property
@return the result of ldapsearch for an ldap url, false otherwise
*/
public boolean search(String url)
{
  // work on a copy layered over direnv so the instance properties stay untouched
  Properties settings= new Properties(direnv);

  String target= url;
  if (target == null || target.length() == 0) {
    target= settings.getProperty(URL);
  }
  if (target != null) {
    settings= parseUrl(target, settings);
  }
  if (settings.getProperty(URL) == null) {
    usage();
  }

  if (debug) {
    err.println("Search env:");
    settings.list(err);
  }

  String protocol= settings.getProperty(PROTOCOL);
  if (!"ldap".equals(protocol)) {
    // http would go to a websearch(env) that does not exist yet; anything else is unsupported
    return false;
  }
  return ldapsearch(settings);
}
/**
Prints the entries found by the last search and releases the result enumeration.
Output goes to the file named by the OUTPUT property when one is set (names of a
single character are ignored), otherwise to the current output stream. If the file
cannot be opened a message is written to the error stream and the current output
stream is used instead.
For each entry, up to nrecs entries, this prints the name ("dn: ..."), then the
attributes as "key: value" lines, then an empty line; the LISTDN, LISTKEY and LISTVAL
properties (all true by default) switch the name, the keys and the values on or off.
A failure while reading results ends the listing; it is reported only in debug mode.
On return the result enumeration is closed and forgotten, a file opened here is
closed and the previous output stream restored; a stream that was not opened here
(standard output by default) is flushed, not closed.
*/
public void retrieve()
{
  PrintStream previous= out;
  PrintStream fileout= null;
  String output= direnv.getProperty(OUTPUT);
  if (output!=null && output.length()>1) {
    try {
      fileout= new PrintStream( new FileOutputStream( output));
      out= fileout;
    }
    catch (Exception ex) {
      // still best effort (fall back to the current stream), but no longer silent
      err.println(getClass().getName()+".retrieve.ERROR: cannot open output "+output+": "+ex.getMessage());
    }
  }

  // ldapsearch -LLL equivalents...
  showdn = boolOf( direnv.getProperty(LISTDN,"true") );
  showkey= boolOf( direnv.getProperty(LISTKEY,"true") );
  showval= boolOf( direnv.getProperty(LISTVAL,"true") );

  int nitems= 0;
  try {
    if (dirresults!=null) {
      for (int ir= 0; ir < nrecs && dirresults.hasMore() ; ir++) {
        printSearchResult( (SearchResult) dirresults.next() );
        nitems++;
      }
    }
  }
  catch (Exception e) {
    if (debug) err.println(getClass().getName()+".retrieve.ERROR: "+e.getMessage());
  }
  finally {
    if (dirresults!=null) {
      // release the enumeration even when the loop stopped early; a closed
      // enumeration must not be used again, so drop the reference as well
      try { dirresults.close(); }
      catch (Exception ignored) { /* nothing useful can be done about a failed close */ }
      dirresults= null;
    }
    if (debug) { err.println("retrieved: "+nitems); }
    if (fileout!=null) {
      fileout.close();
      out= previous;
    }
    else {
      // not opened here (System.out by default): flush it, leave it open
      out.flush();
    }
  }
}

/** Prints one search result to out, honouring the showdn, showkey and showval switches. */
private void printSearchResult(SearchResult entry) throws NamingException
{
  if (showdn) { out.print("dn: "); out.println(entry.getName()); }
  Attributes attrs= entry.getAttributes();
  for (NamingEnumeration es= attrs.getAll(); es.hasMore() ; ) {
    Attribute at= (Attribute) es.next();
    String na= at.getID();
    if (showkey && !showval) out.println(na);
    else for (NamingEnumeration ea= at.getAll(); ea.hasMore() ; ) {
      Object va= ea.next();
      if (showval) {
        if (showkey) out.print(na+": " );
        // if binary .. handle
        out.println(va);
      }
    }
  }
  if (showdn||showkey) out.println(); //? always
  // drop the entry's payload so it can be collected while the enumeration is still open
  entry.setAttributes(null);
  entry.setObject(null);
}
/**
Builds the ldap search filter from the properties and runs the search.
The OBJECT, START, COUNT and FORMAT properties, when set, become the filter terms
(objectClass=..), (start=..), (count=..) and (format=..). The QUERY property is
appended after them, and when at least one such term is present the whole is wrapped
as (&...). A missing or blank QUERY adds nothing; if no term is present either, the
match-everything filter (objectClass=*) is used, the default for an ldap url without
a filter.
@param env search properties: URL, SCOPE, QUERY, ATTRIBUTES, EXTENSIONS and the keys above
@return the result of the six-argument ldapsearch
*/
boolean ldapsearch(Properties env)
{
  //String query= env.getProperty(QUERY);
  //int e= query.lastIndexOf(")"); if (e<0) e= query.length();
  String[] termnames= { "objectClass", "start", "count", "format" };
  String[] termkeys = { OBJECT, START, COUNT, FORMAT };
  StringBuffer qb= new StringBuffer();
  for (int i= 0; i < termkeys.length; i++) {
    String v= env.getProperty(termkeys[i]);
    if (v!=null) qb.append("("+termnames[i]+"="+v+")");
  }
  boolean doand= (qb.length()>0);

  // appending a null String would add the literal text "null" to the filter
  String query= env.getProperty(QUERY);
  if (query!=null) query= query.trim();
  if (query!=null && query.length()>0) qb.append(query);
  else if (!doand) qb.append("(objectClass=*)");

  if (doand) { qb.insert(0,"(&"); qb.append(")"); }
  if (debug) { err.println("ldap query: "+qb); }
  return ldapsearch(env,
    env.getProperty(URL),
    env.getProperty(SCOPE),
    qb.toString(),
    splitString( env.getProperty(ATTRIBUTES)," ,;"),
    splitString( env.getProperty(EXTENSIONS),"=,;"));
}
/**
Opens a directory context from env, runs the search and keeps the result
enumeration in dirresults for retrieve().
NOTE(review): one line of this method is truncated in this copy of the file (see the
note in the body); the opening of the try block, the handling of extn, scope and
ldapurl, and the declarations of sc and basedn are not visible, so their behaviour
cannot be confirmed from here.
@param env environment passed to InitialDirContext
@param ldapurl url of the directory; its use is in the missing text
@param scope search scope; its use is in the missing text
@param filter ldap search filter passed to the directory search
@param attr attributes to return; null is treated as an empty list
@param extn extension controls; null is treated as an empty list
@return true when the search yields at least one result, false when it yields
none or when any exception is thrown
*/
boolean ldapsearch( Properties env, String ldapurl, String scope, String filter,
String[] attr, String[] extn)
{
if (attr==null) attr= new String[0];
if (extn==null) extn= new String[0];
String sizelimit= null, timelimit= null, deref= null;
// NOTE(review): this line is damaged - text between "i" and "0)" is missing. What is
// left shows only that the returning attributes are set on sc under some condition.
// Presumably the lost text read sizelimit/timelimit/deref from extn, opened the try
// block closed below and created sc and basedn - TODO restore from the original source.
for (int i=0; i0) sc.setReturningAttributes(attr);
//? if (filter==null) filter="(objectClass=*)";
dir = (DirContext) new InitialDirContext(env);
dirresults = dir.search( basedn, filter, sc);
return (dirresults.hasMore());
}
catch (Exception e) {
// the failure is reported only in debug mode; the caller just sees false
if (debug) System.err.println(getClass().getName()+".ldapsearch.ERROR: "+e.getMessage());
// report error
return false;
}
}
/**
Stores v under k in h, doing nothing when v is null
(a Hashtable does not accept null values).
*/
private static final void hput(Hashtable h,String k,String v) {
  if (v == null) {
    return; //else h.put(k,"");
  }
  h.put(k, v);
}
/**
Splits a url into its components and stores them in h under the Biogridlet.* keys.
Three parsers are tried in turn:
 1. for a url starting with "ldap", the ldap url form
    ldap://host:port/dn?attributes?scope?filter?extensions, which sets URL, PROTOCOL,
    HOST, PORT, DN, ATTRIBUTES, SCOPE, QUERY, EXTENSIONS and TITLE;
 2. java.net.URL, which sets URL, PROTOCOL, HOST, PORT, PATH, FILE, REF, QUERY and TITLE;
 3. as a last resort the text before the first ':' is taken as PROTOCOL ("unknown"
    when there is none) and the url itself is used for URL and TITLE.
Components that are absent (null) are not stored.
@param url url to parse; when null nothing is stored
@param h table to fill; a new Properties is created when null
@return h, or the newly created Properties
*/
public Properties parseUrl(String url, Properties h) {
  if (h==null) h= new Properties();
  if (url==null) return h; // nothing to parse; avoids a NullPointerException below

  if (url.startsWith("ldap")) {
    try {
      // ldap://host:port/dn?attributes?scope?filter?extensions
      // NOTE(review): com.sun.jndi.ldap.LdapURL is a JDK-internal class, not part of the
      // public JNDI API; the java.naming module does not export its package, so this
      // does not compile on Java 9 and later without extra flags - consider replacing it.
      com.sun.jndi.ldap.LdapURL lu= new com.sun.jndi.ldap.LdapURL(url);
      //hput(h,"url",lu); //?
      hput(h, URL,lu.toString());
      hput(h, PROTOCOL,lu.getScheme());
      hput(h, HOST,lu.getHost());
      //int port= lu.getPort(); if (port==0) port= 389;
      hput(h, PORT,String.valueOf(lu.getPort()));
      hput(h, DN,lu.getDN());
      hput(h, ATTRIBUTES,lu.getAttributes());
      hput(h, SCOPE,lu.getScope());
      hput(h, QUERY,lu.getFilter());
      hput(h, EXTENSIONS,lu.getExtensions());
      String t= lu.getScheme()+ ":/"+ lu.getHost() + "/"+ lu.getDN();
      hput(h, TITLE,t);
      return h;
    }
    catch (Exception ex) {
      // not a usable ldap url: fall through to the generic parsers below
      if (debug) err.println(getClass().getName()+".parseUrl: not parsed as ldap url: "+ex.getMessage());
    }
  }

  try {
    URL lu= new URL(url); // java.net.MalformedURLException: unknown protocol: ldap
    hput(h,URL,lu.toString());
    hput(h,PROTOCOL,lu.getProtocol());
    hput(h,HOST,lu.getHost());
    hput(h,PORT,String.valueOf(lu.getPort()));
    hput(h,PATH,lu.getPath());
    hput(h,FILE,lu.getFile());
    hput(h,REF,lu.getRef());
    hput(h,QUERY,lu.getQuery());
    // a url without a host reports an empty host name, not null
    String host= lu.getHost();
    boolean local= (host==null || host.length()==0 || "localhost".equals(host));
    String t;
    if (local)
      t= lu.getFile();
    else
      t= lu.getProtocol()+ ":/"+ host + lu.getFile();
    hput(h,TITLE,t);
  }
  catch (Exception e) {
    // expected for protocols java.net.URL does not know (ldap among them):
    // keep the url as given and take the protocol from its prefix
    hput(h,URL,url);
    int c= url.indexOf(":");
    if (c>0) hput(h,PROTOCOL,url.substring(0,c));
    else hput(h,PROTOCOL, "unknown");
    hput(h,TITLE,url);
  }
  return h;
}
/**
Reads a property value as an on/off flag.
@param val value to test; it is converted with String.valueOf, so null reads as the text "null"
@return true for "true", "on" or "yes" in any letter case and for "1"; false for everything else
*/
private static final boolean boolOf(Object val) {
  final String text= String.valueOf(val);
  final String[] truthy= { "true", "1", "on", "yes" };
  for (int i= 0; i < truthy.length; i++) {
    // "1" has no letter case, so the case-insensitive comparison is exact for it
    if (truthy[i].equalsIgnoreCase(text)) {
      return true;
    }
  }
  return false;
}
private static String[] splitString(String s, String del) {
if (s==null) return new String[0];
StringTokenizer st= new StringTokenizer(s, del);
int n= st.countTokens(); String[] ss= new String[n];
for (int i=0; i