# bioseq4.schema
# ldap schema for bio-sequence databanks in Bio-Mirror project
# minimal info common to many, or needed for some
# version 0.4, dec02, d.gilbert

## ?? make this schema self-contained? (no refs to core.schema, other ldap schema)

## grep -- '-oid' bioseq4.schema |\
##  perl -e 'while(<>){if (/([\w\-]+-oid)/) \
##  {$o= $1;unless($h{$o}++){ $n++;print "objectidentifier $o 1953.$n\n";}}}' \
##  > ! bioseq4.oids

# Note: as much as possible, match attribute name to SRS databank field names
# to simplify srs2ldap: need mapping anyway, but LDAP enforces use of
# primary schema-defined names in query filters (substituting alternate for primary)

## ldap Syntax Object Identifiers
#binary: 1.3.6.1.4.1.1466.115.121.1.5
#integer:1.3.6.1.4.1.1466.115.121.1.27
#directory string: 1.3.6.1.4.1.1466.115.121.1.15
#printable string: 1.3.6.1.4.1.1466.115.121.1.44
#numeric string: 1.3.6.1.4.1.1466.115.121.1.36
#octet string: 1.3.6.1.4.1.1466.115.121.1.40

## to some extent this schema should be auto-generated from [srs] databanks/backend
## need to add srs fields or equivalents to handle
## common ones for bioseqs: acc, des, gen(geneName), div
##  key, (dat/crd/crlu), cc
## refs: aut, tit, jnl, vol, fp, yr, mid, pmd, rc.
# dbn = DbName == lib??
# dbxref == dr for swissprot

## new schema format for openldap 2.1.x
# OID macros: each macro extends the previous one with ":<arc>", and every
# attribute type / object class in this file is numbered under them.
# NOTE(review): the root's second arc is 129 under first arc 1; X.660 limits
# that arc to 0-39 -- confirm the intended root before publishing.
objectIdentifier BioMirrorRoot 1.129.79.225200
objectIdentifier BioSeq BioMirrorRoot:1
objectIdentifier BioSeqattributeType BioSeq:3
objectIdentifier BioSeqObjectClass BioSeq:4

# = srs field
# Keywords: directory string, case-insensitive equality and substring match.
attributetype ( BioSeqattributeType:1
    NAME ( 'kw' 'key' 'Keywords' )
    DESC 'Keywords'
    EQUALITY caseIgnoreMatch
    SUBSTR caseIgnoreSubstringsMatch
    SYNTAX 1.3.6.1.4.1.1466.115.121.1.15 )

# = srs field
# Gene name: equality is case-exact, substring search is case-insensitive.
attributetype ( BioSeqattributeType:2
    NAME ( 'gen' 'GeneName' )
    DESC 'Gene name'
    EQUALITY caseExactMatch
    SUBSTR caseIgnoreSubstringsMatch
    SYNTAX 1.3.6.1.4.1.1466.115.121.1.15 )

# = srs field
# Free-text comment.
attributetype ( BioSeqattributeType:3
    NAME ( 'cc' 'Comment' )
    DESC 'Comment'
    EQUALITY caseIgnoreMatch
    SUBSTR caseIgnoreSubstringsMatch
    SYNTAX 1.3.6.1.4.1.1466.115.121.1.15 )

# CRC value, stored as a directory string; equality match only (no SUBSTR rule).
attributetype ( BioSeqattributeType:4
    NAME ( 'cr' 'crc' )
    DESC 'CRC value'
    EQUALITY caseIgnoreMatch
    SYNTAX 1.3.6.1.4.1.1466.115.121.1.15 )

# Databank file: equality is case-exact, substring search is case-insensitive.
attributetype ( BioSeqattributeType:5
    NAME ( 'fl' 'file' )
    DESC 'Databank file'
    EQUALITY caseExactMatch
    SUBSTR caseIgnoreSubstringsMatch
    SYNTAX 1.3.6.1.4.1.1466.115.121.1.15 )

# = srs library key
attributetype ( BioSeqattributeType:6
    NAME ( 'lib' 'Library' )
    DESC 'Databank library'
    EQUALITY caseIgnoreMatch
    SUBSTR caseIgnoreSubstringsMatch
    SYNTAX 1.3.6.1.4.1.1466.115.121.1.15 )

# Numbered 6.1 / 6.2, i.e. as sub-arcs beneath the 'lib' attribute's OID.
attributetype ( BioSeqattributeType:6.1
    NAME ( 'dir' 'Directory' )
    DESC 'Biodata Directory'
    EQUALITY caseIgnoreMatch
    SUBSTR caseIgnoreSubstringsMatch
    SYNTAX 1.3.6.1.4.1.1466.115.121.1.15 )

attributetype ( BioSeqattributeType:6.2
    NAME ( 'srv' 'service' )
    DESC 'Bioinformatics Service'
    EQUALITY caseIgnoreMatch
    SUBSTR caseIgnoreSubstringsMatch
    SYNTAX 1.3.6.1.4.1.1466.115.121.1.15 )

# ?
# use this or use generic 'category' attribute
# Category (group) that a databank library belongs to.
attributetype ( BioSeqattributeType:7
    NAME ( 'libgroup' 'LibraryGroup' )
    DESC 'Databank library category'
    EQUALITY caseIgnoreMatch
    SUBSTR caseIgnoreSubstringsMatch
    SYNTAX 1.3.6.1.4.1.1466.115.121.1.15 )

attributetype ( BioSeqattributeType:8
    NAME ( 'field' 'DatabankField' )
    DESC 'Databank Field'
    EQUALITY caseIgnoreMatch
    SUBSTR caseIgnoreSubstringsMatch
    SYNTAX 1.3.6.1.4.1.1466.115.121.1.15 )

attributetype ( BioSeqattributeType:9
    NAME ( 'link' 'DatabankLink' )
    DESC 'Linked databank'
    EQUALITY caseIgnoreMatch
    SUBSTR caseIgnoreSubstringsMatch
    SYNTAX 1.3.6.1.4.1.1466.115.121.1.15 )

# = srs field
# 'Division' is the correctly spelled long name; the historical misspelling
# 'Divison' is kept as an alias so existing data and filters keep working.
attributetype ( BioSeqattributeType:10
    NAME ( 'div' 'Division' 'Divison' )
    DESC 'Databank division'
    EQUALITY caseIgnoreMatch
    SUBSTR caseIgnoreSubstringsMatch
    SYNTAX 1.3.6.1.4.1.1466.115.121.1.15 )

# Integer-valued attributes (integer syntax, integerMatch equality).
attributetype ( BioSeqattributeType:11
    NAME ( 'count' )
    DESC 'count of items'
    EQUALITY integerMatch
    SYNTAX 1.3.6.1.4.1.1466.115.121.1.27 )

attributetype ( BioSeqattributeType:12
    NAME ( 'in' 'index' )
    DESC 'index of items'
    EQUALITY integerMatch
    SYNTAX 1.3.6.1.4.1.1466.115.121.1.27 )

attributetype ( BioSeqattributeType:13
    NAME ( 'start' 'Startitem' )
    DESC 'start of items'
    EQUALITY integerMatch
    SYNTAX 1.3.6.1.4.1.1466.115.121.1.27 )

# Numbered 13.1, i.e. as a sub-arc beneath the 'start' attribute's OID.
attributetype ( BioSeqattributeType:13.1
    NAME ( 'query' 'filter' )
    DESC 'Query string'
    EQUALITY caseIgnoreMatch
    SUBSTR caseIgnoreSubstringsMatch
    SYNTAX 1.3.6.1.4.1.1466.115.121.1.15 )

# Disabled definition, retained for reference.
#attributetype ( BioSeqattributeType:14
#  NAME 'data-ref'
#  DESC 'referral to ldap://site-info containing data'
#  EQUALITY caseIgnoreMatch
#  SUBSTR caseIgnoreSubstringsMatch
#  SYNTAX 1.3.6.1.4.1.1466.115.121.1.15 )
#was ref, ldap v2.1.9 won't allow !??
#  SUP ref

## add from gnomap2.schema for gnomap ldif data
attributetype ( BioSeqattributeType:15
    NAME ( 'chr' 'Chromosome' )
    DESC 'Chromosome'
    EQUALITY caseIgnoreMatch
    SUBSTR caseIgnoreSubstringsMatch
    SYNTAX 1.3.6.1.4.1.1466.115.121.1.15 )

attributetype ( BioSeqattributeType:16
    NAME ( 'ft' 'ftk' 'Feature' )
    DESC 'Sequence feature'
    EQUALITY caseIgnoreMatch
    SUBSTR caseIgnoreSubstringsMatch
    SYNTAX 1.3.6.1.4.1.1466.115.121.1.15 )

# Feature extent: first and last base numbers, both integer-valued.
attributetype ( BioSeqattributeType:17
    NAME ( 'bpb' 'bstart' 'Base-start' )
    DESC 'start base number of feature'
    EQUALITY integerMatch
    SYNTAX 1.3.6.1.4.1.1466.115.121.1.27 )

attributetype ( BioSeqattributeType:18
    NAME ( 'bpe' 'bstop' 'Base-stop' )
    DESC 'last base number of feature'
    EQUALITY integerMatch
    SYNTAX 1.3.6.1.4.1.1466.115.121.1.27 )

## in biomirror-catalog-spec.schema
# Object identifier: at most one value per entry (SINGLE-VALUE); equality is
# case-exact while substring search is case-insensitive.
attributetype ( BioSeqattributeType:19
    NAME 'id'
    DESC 'object id'
    EQUALITY caseExactMatch
    SUBSTR caseIgnoreSubstringsMatch
    SYNTAX 1.3.6.1.4.1.1466.115.121.1.15
    SINGLE-VALUE )

## in biomirror-catalog-spec.schema
attributetype ( BioSeqattributeType:20
    NAME ( 'fmt' 'format' )
    DESC 'data format'
    EQUALITY caseIgnoreMatch
    SUBSTR caseIgnoreSubstringsMatch
    SYNTAX 1.3.6.1.4.1.1466.115.121.1.15 )

# = srs field
attributetype ( BioSeqattributeType:21
    NAME ( 'ac' 'acc' )
    DESC 'accession'
    EQUALITY caseIgnoreMatch
    SUBSTR caseIgnoreSubstringsMatch
    SYNTAX 1.3.6.1.4.1.1466.115.121.1.15 )

# same as gnomap.schema
# org is srs field, make it primary name ??
attributetype ( BioSeqattributeType:22
    NAME ( 'os' 'org' 'spp' 'Species' )
    DESC 'Species'
    EQUALITY caseIgnoreMatch
    SUBSTR caseIgnoreSubstringsMatch
    SYNTAX 1.3.6.1.4.1.1466.115.121.1.15 )

## need to fix this to parsable date - number like 20020130
## crlu == swissprot/trembl/embl date updated
## do we need a different attribute for each date subtype ?
# Date, currently held as a plain directory string; one attribute carries
# all of the date aliases listed in NAME.
attributetype ( BioSeqattributeType:23
    NAME ( 'dt' 'dat' 'lsu' 'lau' 'crlu' 'crd' 'Date' )
    DESC 'Date'
    EQUALITY caseIgnoreMatch
    SUBSTR caseIgnoreSubstringsMatch
    SYNTAX 1.3.6.1.4.1.1466.115.121.1.15 )

## in biomirror-catalog-spec.schema
## replace with core labeledURI attr
# Subtype of labeledURI (defined outside this file) that declares its own
# case-insensitive equality and substring matching rules.
attributetype ( BioSeqattributeType:24
    NAME 'url'
    DESC 'universal resource locator'
    SUP labeledURI
    EQUALITY caseIgnoreMatch
    SUBSTR caseIgnoreSubstringsMatch )

# Subtype of labeledURI with nothing overridden.
attributetype ( BioSeqattributeType:25
    NAME 'web'
    DESC 'web page'
    SUP labeledURI )

# Numbered 25.1, i.e. as a sub-arc beneath the 'web' attribute's OID.
attributetype ( BioSeqattributeType:25.1
    NAME ( 'taxon' 'taxonomy' )
    DESC 'taxonomic or phylogenetic classification'
    EQUALITY caseIgnoreMatch
    SUBSTR caseIgnoreSubstringsMatch
    SYNTAX 1.3.6.1.4.1.1466.115.121.1.15 )

# ? use standard Definition attribute
# = srs field
attributetype ( BioSeqattributeType:26
    NAME ( 'de' 'des' )
    DESC 'Description'
    EQUALITY caseIgnoreMatch
    SUBSTR caseIgnoreSubstringsMatch
    SYNTAX 1.3.6.1.4.1.1466.115.121.1.15 )

## in gnomap schema
# srs field names vary (dr, ??)
attributetype ( BioSeqattributeType:27
    NAME ( 'dr' 'dbxref' )
    DESC 'Database cross-reference'
    EQUALITY caseIgnoreMatch
    SUBSTR caseIgnoreSubstringsMatch
    SYNTAX 1.3.6.1.4.1.1466.115.121.1.15 )

## in gnomap schema
attributetype ( BioSeqattributeType:28
    NAME ( 'con' 'content' )
    DESC 'data content types'
    EQUALITY caseIgnoreMatch
    SUBSTR caseIgnoreSubstringsMatch
    SYNTAX 1.3.6.1.4.1.1466.115.121.1.15 )

## should be numeric value
# = srs field
# Sequence length, integer-valued.
attributetype ( BioSeqattributeType:29
    NAME ( 'sl' 'Seqlength' )
    DESC 'length of sequence'
    EQUALITY integerMatch
    SYNTAX 1.3.6.1.4.1.1466.115.121.1.27 )

# this should be an enum:
# = srs field
attributetype ( BioSeqattributeType:30
    NAME 'mol'
    DESC 'Molecule type'
    EQUALITY caseIgnoreMatch
    SUBSTR caseIgnoreSubstringsMatch
    SYNTAX 1.3.6.1.4.1.1466.115.121.1.15 )

# na,bna,aa,baa are in gnomap.schema
# ? use mol type and only one seq attr for na/aa ?
# Sequence payloads come in pairs: a directory-string form with equality
# matching only, and a binary-syntax form that declares no matching rules.
attributetype ( BioSeqattributeType:31
    NAME ( 'sq' 'seq' 'na' 'nucleic-acid' )
    DESC 'sequence of nucleic acid data'
    EQUALITY caseIgnoreMatch
    SYNTAX 1.3.6.1.4.1.1466.115.121.1.15 )

attributetype ( BioSeqattributeType:32
    NAME ( 'bseq' 'bna' 'binary-nucleic-acid' )
    DESC 'binary coded sequence of nucleic acid data'
    SYNTAX 1.3.6.1.4.1.1466.115.121.1.5 )
# SUP nucleic-acid

attributetype ( BioSeqattributeType:33
    NAME ( 'aa' 'amino-acid' )
    DESC 'sequence of amino acid data'
    EQUALITY caseIgnoreMatch
    SYNTAX 1.3.6.1.4.1.1466.115.121.1.15 )

attributetype ( BioSeqattributeType:33.1
    NAME ( 'baa' 'binary-amino-acid' )
    DESC 'binary coded sequence of amino acid data'
    SYNTAX 1.3.6.1.4.1.1466.115.121.1.5 )
# SUP amino-acid

# Whole data records: native text form and gzip-compressed binary form.
attributetype ( BioSeqattributeType:34
    NAME ( 'drec' 'data-record' )
    DESC 'biology data record (native format)'
    EQUALITY caseIgnoreMatch
    SYNTAX 1.3.6.1.4.1.1466.115.121.1.15 )

attributetype ( BioSeqattributeType:35
    NAME ( 'brec' 'compressed-data-record' )
    DESC 'gzip compressed data record'
    SYNTAX 1.3.6.1.4.1.1466.115.121.1.5 )
# SUP data-record

# objects ------------

# Directory of biosequence data: 'id' is mandatory, everything else optional.
# 'cn' and 'name' are not defined in this file and must come from another
# loaded schema.
objectClass ( BioSeqObjectClass:1
    NAME ( 'biodir' 'BioseqDirectory' )
    DESC 'Biosequence Data directory'
    SUP top
    MUST ( id )
    MAY ( cn $ name $ lib $ des $ url $ web $ con ) )

# this is object for directory lib attribute (id == directory.lib ?)
# Same attribute set as biodir plus the optional data format 'fmt'.
objectClass ( BioSeqObjectClass:2
    NAME 'BioseqDatabank'
    DESC 'Biosequence Databank'
    SUP top
    MUST ( id )
    MAY ( cn $ name $ lib $ des $ url $ web $ con $ fmt ) )

# add some set range info: start,length ; also flavors for Entry/Record/BinRecord ?
# Set of biosequence entries: 'id' is mandatory, everything else optional.
# The MAY list names each attribute once (the earlier list repeated 'fmt'
# and 'des'), which makes it identical to the BiodataFile list.
objectClass ( BioSeqObjectClass:3
    NAME 'BioseqSet'
    DESC 'Set of Biosequence Entries'
    SUP top
    MUST ( id )
    MAY ( cn $ name $ lib $ des $ url $ web $ con $ fmt $
          cc $ key $ dat $ spp ) )
## SUP BioseqDatabank
# data-ref $ ref

# File of databank records; same mandatory and optional attributes as BioseqSet.
objectClass ( BioSeqObjectClass:4
    NAME 'BiodataFile'
    DESC 'File of Biodatabank Records'
    SUP top
    MUST ( id )
    MAY ( cn $ name $ lib $ des $ url $ web $ con $ fmt $
          cc $ key $ dat $ spp ) )
# SUP BioseqDatabank
# data-ref $ ref

# general data object - nonsequence - SUP for Bioseq?
# should also be extensibleObject
#objectclass ( BioSeqObjectClass:5 NAME 'Biodata'
#  DESC 'Biodata Entry'
#  MUST ( id )
#  MAY ( data-ref $ ref $ fmt $ name $ cc $ key $ lib $ des $ dat $ spp )
#  )

# Single data record. SUP top is stated explicitly so this class is declared
# the same way as the other active object classes in this file. The MAY list
# refers to several attributes by alias (format, Date, Species, file, crc,
# index) rather than by their first-listed names.
objectClass ( BioSeqObjectClass:6
    NAME 'BiodataRecord'
    DESC 'Biodata Record'
    SUP top
    MUST ( id )
    MAY ( cn $ name $ acc $ cc $ key $ lib $ des $ format $ Date $ Species $
          file $ crc $ index $ drec $ brec ) )
# $ data-ref $ ref

# Disabled object classes, retained for reference.
#objectclass ( BioSeqObjectClass:7 NAME 'BioseqEntry'
#  DESC 'Biosequence Entry'
#  MUST ( id $ acc )
#  MAY ( fmt $ cc $ div $ gen $ key $ lib $ des $ dat $ mol $ sl $ spp )
#  )
#
#objectclass ( BioSeqObjectClass:8 NAME 'BioseqRecord'
#  DESC 'Biosequence Record'
#  SUP BioseqEntry
#  MAY ( seq )
#  )
#
#objectclass ( BioSeqObjectClass:9 NAME 'BinaryBioseqRecord'
#  DESC 'Binary encoded Biosequence Record'
#  SUP BioseqEntry
#  MAY ( bseq )
#  )
#
### not same as gnomap.schema --- are these objects as well as attributes?
### optional ascii or binary attribute ?
### can we do MUST ( na | bna ) ??
#objectclass ( BioSeqObjectClass:10 NAME 'NA-sequence'
#  DESC 'Nucleic acid sequence record'
#  SUP BioseqEntry
#  MAY ( na $ bna )
#  )
#
#objectclass ( BioSeqObjectClass:11 NAME 'AA-sequence'
#  DESC 'Amino acid sequence record'
#  SUP BioseqEntry
#  MAY ( aa $ baa )
#  )