This is a difference file for the WAIS source, version 8b5, as available via ftp from ftp.think.com. It includes changes for boolean operators, literal string matches, partial word matches, and biology-specific needs. See the ftp/gopher archive at ftp.bio.indiana.edu, /util/wais/iubio* for more information. At least a couple of problems remain: The wais-8b5 code will not properly index words that occur more than 32,000 times. The Genbank biology databank that I serve here has > 80,000 records, and some important index words occur more often than this limit. The wais release 8b3 was amenable to raising this limit, but 8b5 is not. I'll continue to look into this problem. The partial word matching addition of mine is imperfect -- it misses some words that should match. I'd appreciate help from anyone who can look into this. I'll also work on it as time permits. You can apply this patch with Larry Wall's patch program (available at many unix ftp sites) by running the following in the directory containing wais-8-b5/ (substitute the name under which you saved this file for bintypes.patch): % patch -p < bintypes.patch -- Don Gilbert (gilbertd@bio.indiana.edu) diff -bwcr wais-8-b5/Makefile w8b5bio/Makefile *** wais-8-b5/Makefile Mon Oct 12 12:13:10 1992 --- w8b5bio/Makefile Wed Oct 28 08:44:40 1992 *************** *** 41,47 **** RANLIB = ranlib # on IBM RS6000 this should be c89. ! CC = cc # set this for your site. This syntax only works in SunOS # for other UNIX-like OS's set this to this directory. --- 41,48 ---- RANLIB = ranlib # on IBM RS6000 this should be c89. ! #CC = cc ! CC = gcc # set this for your site. This syntax only works in SunOS # for other UNIX-like OS's set this to this directory. *************** *** 61,71 **** # for relevance feedaback in the search engine, add -DRELEVANCE_FEEDBACK # USG for Unix Dirent in lib # for SGIs running IRIX 4.0.1, add -cckr ! CFLAGS = -g -I$(SUPDIR) -DSECURE_SERVER -DRELEVANCE_FEEDBACK -DUSG MAKE = make -k ! 
default: config.h lib ir ui bin doc x tags @echo "Welcome to WAIS" config.h: config --- 62,80 ---- # for relevance feedaback in the search engine, add -DRELEVANCE_FEEDBACK # USG for Unix Dirent in lib # for SGIs running IRIX 4.0.1, add -cckr ! # dgg additions ! # LITERAL == waisserver, search for "literal strings" ! # BOOLEANS == waisserver, search with boolean AND, NOT operators ! # PARTIALWORD == waisserver, search for partial words, hum* matches human, hummingbird, ... ! # BIO == waisindex, waisserver changes including symbol indexing & search & bio data formats ! # ! #CFLAGS = -g -I$(SUPDIR) -DSECURE_SERVER -DRELEVANCE_FEEDBACK -DUSG ! CFLAGS = -g -I$(SUPDIR) -DSECURE_SERVER -DRELEVANCE_FEEDBACK -DUSG -DBIO -DBOOLEANS -DPARTIALWORD -DLITERAL MAKE = make -k ! #default: config.h lib ir ui bin doc x tags ! default: config.h lib ir ui bin doc x @echo "Welcome to WAIS" config.h: config Common subdirectories: wais-8-b5/bin and w8b5bio/bin Common subdirectories: wais-8-b5/doc and w8b5bio/doc Common subdirectories: wais-8-b5/ir and w8b5bio/ir Common subdirectories: wais-8-b5/lib and w8b5bio/lib Common subdirectories: wais-8-b5/ui and w8b5bio/ui Common subdirectories: wais-8-b5/wais-sources and w8b5bio/wais-sources Common subdirectories: wais-8-b5/x and w8b5bio/x diff -bwcr wais-8-b5/doc/Makefile w8b5bio/doc/Makefile *** wais-8-b5/doc/Makefile Fri Nov 22 18:27:26 1991 --- w8b5bio/doc/Makefile Sat Oct 24 11:22:55 1992 *************** *** 36,41 **** --- 36,42 ---- ../wais-sources/wais-docs.dct: $(text) ../bin/waisindex -d ../wais-sources/wais-docs -t text *.txt + ../bin/waisindex -d ../wais-sources/test -t bio *.bio clean: $(RM) *~ Common subdirectories: wais-8-b5/doc/manl and w8b5bio/doc/manl Only in w8b5bio/doc: stop.test Only in w8b5bio/doc: test.bio diff -bwcr wais-8-b5/ir/Makefile w8b5bio/ir/Makefile *** wais-8-b5/ir/Makefile Sun May 10 16:42:23 1992 --- w8b5bio/ir/Makefile Sun Nov 8 19:25:02 1992 *************** *** 173,179 **** # C Compiler. Use either cc or gcc. 
Comment this out in release to # inherit from top level. ! CC = cc # Compiler & linker flags. # Compiler debug flag. Use -g for debugging, -O for optimization. --- 173,180 ---- # C Compiler. Use either cc or gcc. Comment this out in release to # inherit from top level. ! #CC = cc ! CC = gcc # Compiler & linker flags. # Compiler debug flag. Use -g for debugging, -O for optimization. *************** *** 181,187 **** # for System V add -DSYSV # for XENIX add -M3e -Zi # -DSECURE_SERVER for waisserver to setuid to uucp after startup. ! CFLAGS = -g -DSECURE_SERVER -DRELEVANCE_FEEDBACK # dealing with taking an intenet name and resolving it. # set to -lresolv or nothing --- 182,196 ---- # for System V add -DSYSV # for XENIX add -M3e -Zi # -DSECURE_SERVER for waisserver to setuid to uucp after startup. ! # ! # -- dgg additions for biology data ! # -DPARTIALWORD (waisserver) for partial word matches, hum* matches human, hummingbird, ... ! # -DBOOLEANS (waisserver) for boolean AND, NOT patches (dgg) ! # -DLITERAL (waisserver) search for "literal strings" (dgg) ! # -DBIO for biology source patches, including symbol usage (dgg, 12oct92) ! # ! CFLAGS = -g -DSECURE_SERVER -DRELEVANCE_FEEDBACK -DBIO -DBOOLEANS -DPARTIALWORD -DLITERAL ! #CFLAGS = -g -DSECURE_SERVER -DRELEVANCE_FEEDBACK # dealing with taking an intenet name and resolving it. # set to -lresolv or nothing diff -bwcr wais-8-b5/ir/cutil.c w8b5bio/ir/cutil.c *** wais-8-b5/ir/cutil.c Thu Apr 30 11:03:35 1992 --- w8b5bio/ir/cutil.c Sat Oct 31 13:09:27 1992 *************** *** 288,299 **** /*----------------------------------------------------------------------*/ ! typedef long (longfunc) _AP((long c)); char* strtokf(s1,isDelimiter) char* s1; ! 
longfunc *isDelimiter; /* really *isDelimiter() */ /* This function is exactly like strtok, except that instead of passing a delimiter string, you pass a function that decides if a character is --- 288,324 ---- /*----------------------------------------------------------------------*/ ! boolean ! wordbreak_notalnum(ch) /* dgg */ ! long ch; ! { ! return( !isalnum(ch)); ! } + boolean + wordbreak_notgraph(ch) /* dgg */ + long ch; + { + return( !isgraph(ch)); + } + + boolean wordbreak_user(c) + long c; + { + if ( (gDelimiters[0] != '\0') && (strchr(gDelimiters, (char) c) != NULL) ) + return(IS_DELIMITER); + else if (isgraph(c)) + return(NOT_DELIMITER); + else + return(IS_DELIMITER); + } + + typedef boolean (boolfunc) _AP((long c)); + char* strtokf(s1,isDelimiter) char* s1; ! boolfunc *isDelimiter; /* really *isDelimiter() */ /* This function is exactly like strtok, except that instead of passing a delimiter string, you pass a function that decides if a character is *************** *** 303,309 **** */ { static char* searchStr = NULL; ! static longfunc *delimiterFunc; long i; char* startTok = NULL; --- 328,334 ---- */ { static char* searchStr = NULL; ! static boolfunc *delimiterFunc; long i; char* startTok = NULL; diff -bwcr wais-8-b5/ir/cutil.h w8b5bio/ir/cutil.h *** wais-8-b5/ir/cutil.h Sat Mar 7 22:40:03 1992 --- w8b5bio/ir/cutil.h Sat Oct 31 19:55:32 1992 *************** *** 39,44 **** --- 39,45 ---- #endif #define MAX_FILENAME_LEN 255 + #define MAX_DELIMITERS 256 #ifdef ANSI_LIKE #ifndef EXIT_SUCCESS /* only include it if not already included */ *************** *** 122,132 **** char* s_strdup _AP((char* s)); ! char* strtokf _AP((char* s1,long (*isDelimiter)(long c))); char* strtokf_isalnum _AP((char* s1)); ! #define IS_DELIMITER 1 ! #define NOT_DELIMITER !IS_DELIMITER #ifdef ANSI_LIKE /* use ansi */ long cprintf _AP((boolean print,char* format,...)); --- 123,137 ---- char* s_strdup _AP((char* s)); ! boolean wordbreak_notalnum _AP(( long ch)); /* dgg */ ! 
boolean wordbreak_notgraph _AP(( long ch)); /* dgg */ ! boolean wordbreak_user _AP(( long ch)); /* dgg, uses gDelimiters */ ! ! char* strtokf _AP((char* s1,boolean (*isDelimiter)(long c))); char* strtokf_isalnum _AP((char* s1)); ! #define IS_DELIMITER true ! #define NOT_DELIMITER false #ifdef ANSI_LIKE /* use ansi */ long cprintf _AP((boolean print,char* format,...)); *************** *** 182,191 **** --- 187,198 ---- long wais_pid = 0; long log_line = 0; long wais_log_level = 10; + char gDelimiters[MAX_DELIMITERS]; /* dgg */ #else extern long wais_pid; extern long log_line; extern wais_log_level; + extern char gDelimiters[]; /* dgg */ #endif /* _C_C_util_ #ifdef __cplusplus diff -bwcr wais-8-b5/ir/irbuild.c w8b5bio/ir/irbuild.c *** wais-8-b5/ir/irbuild.c Sun May 10 16:48:16 1992 --- w8b5bio/ir/irbuild.c Sun Nov 8 18:59:38 1992 *************** *** 97,104 **** --- 97,109 ---- #include "ircfiles.h" #include "version.h" #include "irext.h" + #include "stoplist.h" /* dgg */ + #ifdef BIO + #define INDEXER_DATE "31 Oct 1992, bio patch" + #else #define INDEXER_DATE "Sun May 10 1992" + #endif /* for reporting errors, in WAIStation it is defined in CRetrievalApp.c */ *************** *** 125,130 **** --- 130,139 ---- fprintf(stderr," [-contents] /* Index the contents: this is good for types that\n"); fprintf(stderr," inhibit the indexing of the contents (like gif). /*\n"); fprintf(stderr," [-nocontents] /* Index only the filename, not the contents /*\n"); + #ifdef BIO + fprintf(stderr," [-stop stoplist_filename] /* file of common words to ignore */\n"); + fprintf(stderr," [-delim delimiters] /* list of word delimiter symbols */\n"); + #endif fprintf(stderr," [-cmmem mem%] /* percent of CM memory (CM code only) */\n"); fprintf(stderr," [-T type] /* type becomes the \"TYPE\" of the document. */\n"); fprintf(stderr," [-t /* format of the file. 
if none then each file is a document */\n"); *************** *** 158,163 **** --- 167,184 ---- fprintf(stderr," | objc /* objective-C .h and .m files */\n"); #endif /* def NeXT */ fprintf(stderr," | tiff /* tiff files, only indexes the filename */\n"); + #ifdef BIO + fprintf(stderr," | genbank /* GenBank flatfile format */\n"); + fprintf(stderr," | embl /* EMBL flatfile format */\n"); + fprintf(stderr," | pir /* PIR flatfile format */\n"); + fprintf(stderr," | prositedoc /* Prosite protein doc format */\n"); + fprintf(stderr," | prositedat /* Prosite protein dat format */\n"); + fprintf(stderr," | biojournal /* Bio journal TOC on bionet.journals */\n"); + fprintf(stderr," | redbook /* Drosophila redbook text */\n"); + fprintf(stderr," | flybase /* Drosophila Ashburner data files */\n"); + fprintf(stderr," | flystock /* Drosophila stock lists */\n"); + fprintf(stderr," | din /* Drosophila Info. Newsletter */\n"); + #endif fprintf(stderr," ] filename filename ...\n"); } *************** *** 166,171 **** --- 187,193 ---- extern boolean index_contents; + /* This is the MAIN for building an index. */ void *************** *** 178,187 **** char **argv_copy = argv; char *next_argument; char index_filename[1000]; - boolean (*separator_function)(); - void (*header_function)(); - void (*finish_header_function)(); - long (*date_function)(); boolean adding_to_existing_index = false; boolean traverse_directory = false; boolean word_positions = false; --- 200,205 ---- *************** *** 197,216 **** boolean make_catalog = true; char data_filename[MAXPATHLEN]; char *typename = NULL; /* this is what the user said */ - char *type = NULL; /* this is the type stored with the db */ long start_of_filenames; long hashtable_size = 1L<<16; long flush_after_n_words = 300000; char *command_name; ! next_argument = next_arg(&argc, &argv); ! separator_function = NULL; /* initailize to nil */ header_function = NULL; date_function = NULL; finish_header_function = NULL; ! 
type = "TEXT"; /* default to text */ typename = "Text"; command_name = next_argument; logfile = stderr; --- 215,261 ---- boolean make_catalog = true; char data_filename[MAXPATHLEN]; char *typename = NULL; /* this is what the user said */ long start_of_filenames; long hashtable_size = 1L<<16; long flush_after_n_words = 300000; char *command_name; ! dataopsrec dataops; ! /*------------- these go into dataops ! boolean (*separator_function)(); ! void (*header_function)(); ! void (*finish_header_function)(); ! long (*date_function)(); ! char *type = NULL; ! int minwordlen= 2; ! ---------------*/ ! ! /* dgg -- put all of these separate, datatype-specific functions & params into a record! */ ! gDelimiters[0]= '\0'; /* <-- bombs ?? */ ! dataops.separator_function= NULL; ! dataops.header_function= NULL; ! dataops.date_function= NULL; ! dataops.finish_header_function= NULL; ! dataops.type= "TEXT"; ! dataops.addseparatorwords= false; ! dataops.extraheaderweight= true; ! dataops.repeat_weight= 1; ! dataops.minwordlen= 2; ! dataops.wordDelimiter= wordbreak_notalnum; ! dataops.delimiters= gDelimiters; ! wordDelimiter= wordbreak_notalnum; ! ! /*------ ! separator_function = NULL; header_function = NULL; date_function = NULL; finish_header_function = NULL; ! 
type = "TEXT"; typename = "Text"; + -------*/ + + next_argument = next_arg(&argc, &argv); command_name = next_argument; logfile = stderr; *************** *** 226,231 **** --- 271,277 ---- #else strcpy(index_filename, "index"); /* in the current directory */ #endif /* THINK_C */ + stop_list_file("\0"); /* dgg */ if(NULL == (next_argument = next_arg(&argc, &argv))){ fprintf(stderr,"No arguments specified\n"); *************** *** 241,246 **** --- 287,309 ---- } strcpy(index_filename, next_argument); } + #ifdef BIO + else if (0 == strcmp("-stop", next_argument)){ /* dgg, stoplist file */ + if (NULL == (next_argument = next_arg(&argc, &argv))){ + fprintf(stderr,"Expected filename for the stoplist\n"); + exit(0); + } + stop_list_file(next_argument); + } + else if (0 == strcmp("-delim", next_argument)){ /* dgg, delimiters */ + if (NULL == (next_argument = next_arg(&argc, &argv))){ + fprintf(stderr,"Expected the delimiters argument\n"); + exit(0); + } + strcpy(gDelimiters, next_argument); + } + #endif + else if(0 == strcmp("-a", next_argument)){ adding_to_existing_index = true; } *************** *** 254,260 **** export_database = true; } else if(0 == strcmp("-v", next_argument)){ ! fprintf(stderr,"%s: %s\n", command_name, VERSION, INDEXER_DATE); } else if (0 == strcmp("-stdin", next_argument)) { read_files_from_stdin = true; --- 317,323 ---- export_database = true; } else if(0 == strcmp("-v", next_argument)){ ! fprintf(stderr,"%s: %s %s\n", command_name, VERSION, INDEXER_DATE); } else if (0 == strcmp("-stdin", next_argument)) { read_files_from_stdin = true; *************** *** 328,335 **** if(NULL == (next_argument = next_arg(&argc, &argv))) panic("Expected a file type"); typename = next_argument; ! type = next_argument; ! 
finish_header_function = filename_finish_header_function; } else if(0 == strcmp("-contents", next_argument)){ index_contents = true; --- 391,398 ---- if(NULL == (next_argument = next_arg(&argc, &argv))) panic("Expected a file type"); typename = next_argument; ! dataops.type = next_argument; ! dataops.finish_header_function = filename_finish_header_function; } else if(0 == strcmp("-contents", next_argument)){ index_contents = true; *************** *** 344,597 **** panic("Expected a file type"); if(0 == strcmp("groliers", next_argument)){ typename = next_argument; ! type = "TEXT"; ! separator_function = groliers_separator_function; ! header_function = groliers_header_function; ! finish_header_function = groliers_finish_header_function; } #ifdef NeXT else if(0 == strcmp("objc", next_argument)){ typename = next_argument; ! type = "TEXT"; ! separator_function = wobjc_separator_function; ! header_function = wobjc_header_function; ! finish_header_function = wobjc_finish_header_function; } #endif /* def NeXT */ else if(0 == strcmp("mail", next_argument)){ typename = next_argument; ! type = "TEXT"; ! separator_function = mail_separator_function; ! header_function = mail_header_function; ! date_function = mail_date_function; ! finish_header_function = mail_finish_header_function; } else if(0 == strcmp("mail_or_rmail", next_argument)){ typename = next_argument; ! type = "TEXT"; ! separator_function = mail_or_rmail_separator; ! header_function = mail_header_function; ! date_function = mail_date_function; ! finish_header_function = mail_finish_header_function; } else if(0 == strcmp("mail_digest", next_argument)){ typename = next_argument; ! type = "TEXT"; ! separator_function = mail_digest_separator_function; ! header_function = mail_header_function; ! date_function = mail_date_function; ! finish_header_function = mail_finish_header_function; } else if(0 == strcmp("mh_bboard", next_argument)){ typename = next_argument; ! type = "TEXT"; ! 
separator_function = mh_bboard_separator_function; ! header_function = mail_header_function; ! date_function = mail_date_function; ! finish_header_function = mail_finish_header_function; } else if(0 == strcmp("rmail", next_argument)){ typename = next_argument; ! type = "TEXT"; ! separator_function = rmail_separator_function; ! header_function = mail_header_function; ! date_function = mail_date_function; ! finish_header_function = mail_finish_header_function; } else if(0 == strcmp("netnews", next_argument)){ typename = next_argument; ! type = "TEXT"; ! separator_function = NULL; ! header_function = mail_header_function; ! date_function = mail_date_function; ! finish_header_function = mail_finish_header_function; } else if(0 == strcmp("rn", next_argument)){ typename = next_argument; ! type = "TEXT"; ! separator_function = rn_separator_function; ! header_function = mail_header_function; ! date_function = mail_date_function; ! finish_header_function = mail_finish_header_function; } else if(0 == strcmp("emacsinfo", next_argument)){ typename = next_argument; ! type = "TEXT"; ! separator_function = emacs_info_separator_function; ! header_function = emacs_info_header_function; ! finish_header_function = emacs_info_finish_header_function; } else if(0 == strcmp("catalog", next_argument)){ typename = next_argument; ! type = "TEXT"; ! separator_function = catalog_separator_function; ! header_function = catalog_header_function; ! finish_header_function = catalog_finish_header_function; } else if(0 == strcmp("bio", next_argument)){ typename = next_argument; ! type = "TEXT"; ! separator_function = bio_separator_function; ! header_function = bio_header_function; ! finish_header_function = bio_finish_header_function; } else if(0 == strcmp("cmapp", next_argument)){ typename = next_argument; ! type = "TEXT"; ! separator_function = cmapp_separator_function; ! header_function = cmapp_header_function; ! 
finish_header_function = cmapp_finish_header_function; } else if(0 == strcmp("ftp", next_argument)){ ! type = "TEXT-FTP"; typename = next_argument; ! separator_function = first_line_separator_function; ! header_function = first_line_header_function; ! finish_header_function = first_line_finish_header_function; } else if(0 == strcmp("jargon", next_argument)){ typename = next_argument; ! type = "TEXT"; ! separator_function = jargon_separator_function; ! header_function = jargon_header_function; ! finish_header_function = jargon_finish_header_function; } else if(0 == strcmp("server", next_argument)){ typename = next_argument; ! type = "WSRC"; ! finish_header_function = filename_finish_header_function; } else if(0 == strcmp("text", next_argument)){ ! type = "TEXT"; typename = next_argument; check_for_text_file = true; } else if(0 == strcmp("filename", next_argument)){ ! type = "TEXT"; typename = next_argument; ! finish_header_function = filename_finish_header_function; } else if(0 == strcmp("irg", next_argument)){ typename = next_argument; ! type = "TEXT"; ! separator_function = irg_separator_function; ! header_function = irg_header_function; ! finish_header_function = irg_finish_header_function; } /* dash-separated items , Intro to Algorithms buglist, etc */ else if(0 == strcmp("dash", next_argument)){ ! type = "TEXT"; typename = next_argument; ! separator_function = dash_separator_function; ! header_function = dash_header_function; ! finish_header_function = dash_finish_header_function; } /* one_line-separated items */ else if(0 == strcmp("one_line", next_argument)){ ! type = "TEXT"; typename = next_argument; ! separator_function = one_line_separator_function; ! header_function = one_line_header_function; ! finish_header_function = one_line_finish_header_function; } /* blank line-separated items (paragraphs) */ else if(0 == strcmp("para", next_argument)){ ! type = "TEXT"; typename = next_argument; ! separator_function = para_separator_function; ! 
header_function = para_header_function; ! finish_header_function = para_finish_header_function; } /* seeker items */ else if(0 == strcmp("seeker", next_argument)){ ! type = "TEXT"; typename = next_argument; ! separator_function = seeker_separator_function; ! header_function = seeker_header_function; ! finish_header_function = seeker_finish_header_function; } /* medline format */ else if(0 == strcmp("medline", next_argument)){ ! type = "TEXT"; typename = next_argument; ! separator_function = medline_separator_function; ! header_function = medline_header_function; ! finish_header_function = medline_finish_header_function; } /* refer format */ else if(0 == strcmp("refer", next_argument)){ ! type = "TEXT"; typename = next_argument; ! separator_function = refer_separator_function; ! header_function = refer_header_function; ! finish_header_function = refer_finish_header_function; } /* first_line format */ else if(0 == strcmp("first_line", next_argument)){ ! type = "TEXT"; typename = next_argument; ! separator_function = first_line_separator_function; ! header_function = first_line_header_function; ! finish_header_function = first_line_finish_header_function; } /* rlin items */ else if(0 == strcmp("rlin", next_argument)){ ! type = "TEXT"; typename = next_argument; ! separator_function = rlin_separator_function; ! header_function = rlin_header_function; ! finish_header_function = rlin_finish_header_function; } else if(0 == strcmp("dvi", next_argument)){ typename = next_argument; ! type = "DVI"; ! finish_header_function = filename_finish_header_function; } else if(0 == strcmp("ps", next_argument)){ typename = next_argument; ! type = "PS"; ! finish_header_function = filename_finish_header_function; } else if(0 == strcmp("pict", next_argument)){ typename = next_argument; ! type = "PICT"; ! finish_header_function = filename_finish_header_function; index_contents = false; } else if(0 == strcmp("gif", next_argument)){ typename = next_argument; ! type = "GIF"; ! 
finish_header_function = filename_finish_header_function; index_contents = false; } else if(0 == strcmp("tiff", next_argument)){ typename = next_argument; ! type = "TIFF"; ! finish_header_function = filename_finish_header_function; index_contents = false; } /* BibTeX items */ else if(0 == strcmp("bibtex", next_argument)){ ! type = "TEXT"; typename = next_argument; ! separator_function = bibtex_separator_function; ! header_function = bibtex_header_function; ! finish_header_function = bibtex_finish_header_function; } /* ?:? seperated hypertext items */ else if(0 == strcmp("nhyp", next_argument)){ ! type = "TEXT"; typename = next_argument; ! separator_function = nhyp_separator_function; ! header_function = nhyp_header_function; ! finish_header_function = nhyp_finish_header_function; } else if(0 == strcmp("ziff", next_argument)){ ! type = "TEXT"; typename = next_argument; ! separator_function = ziff_separator_function; ! header_function = ziff_header_function; ! finish_header_function = ziff_finish_header_function; } else{ panic("Don't recognize the '%s' type", next_argument); --- 407,803 ---- panic("Expected a file type"); if(0 == strcmp("groliers", next_argument)){ typename = next_argument; ! dataops.type ="TEXT"; ! dataops.separator_function = groliers_separator_function; ! dataops.header_function = groliers_header_function; ! 
dataops.finish_header_function = groliers_finish_header_function; } + + #ifdef BIO + else if(0 == strcmp("genbank", next_argument)){/* dgg */ + typename = next_argument; + dataops.type ="TEXT"; + dataops.separator_function = genbank_separator_function; + dataops.header_function = genbank_header_function; + dataops.finish_header_function = genbank_finish_header_function; + dataops.date_function = genbank_date_function; + dataops.repeat_weight= 0; + dataops.addseparatorwords= true; + dataops.extraheaderweight= false; + dataops.minwordlen= 2; + } + else if(0 == strcmp("embl", next_argument)){/* dgg */ + typename = next_argument; + dataops.type ="TEXT"; + dataops.separator_function = embl_separator_function; + dataops.header_function = embl_header_function; + dataops.finish_header_function = embl_finish_header_function; + dataops.date_function = embl_date_function; + dataops.repeat_weight= 0; + dataops.addseparatorwords= true; + dataops.extraheaderweight= false; + } + else if(0 == strcmp("pir", next_argument)){/* dgg */ + typename = next_argument; + dataops.type = "TEXT"; + dataops.separator_function = pir_separator_function; + dataops.header_function = pir_header_function; + dataops.finish_header_function = pir_finish_header_function; + dataops.date_function = pir_date_function; + dataops.repeat_weight= 0; + dataops.addseparatorwords= true; + dataops.extraheaderweight= false; + } + else if(0 == strcmp("prositedoc", next_argument)){ /* dgg */ + typename = next_argument; + dataops.type = "TEXT"; + dataops.separator_function = prositedoc_separator_function; + dataops.header_function = prositedoc_header_function; + dataops.finish_header_function = prositedoc_finish_header_function; + dataops.repeat_weight= 0; + dataops.addseparatorwords= true; + dataops.extraheaderweight= false; + } + else if(0 == strcmp("prositedat", next_argument)){ /* dgg */ + typename = next_argument; + dataops.type = "TEXT"; + dataops.separator_function = prositedat_separator_function; + 
dataops.header_function = prositedat_header_function; + dataops.finish_header_function = prositedat_finish_header_function; + dataops.repeat_weight= 0; + dataops.addseparatorwords= true; + dataops.extraheaderweight= false; + } + else if(0 == strcmp("biojournal", next_argument)){ /* dgg */ + typename = next_argument; + dataops.type = "TEXT"; + dataops.separator_function = biojournal_separator_function; + dataops.header_function = biojournal_header_function; + dataops.finish_header_function = biojournal_finish_header_function; + dataops.repeat_weight= 0; + dataops.addseparatorwords= true; + dataops.extraheaderweight= false; + } + + else if(0 == strcmp("redbook", next_argument)){ /* dgg */ + typename = next_argument; + dataops.type = "TEXT"; + dataops.separator_function = redbook_separator_function; + dataops.header_function = redbook_header_function; + dataops.finish_header_function = redbook_finish_header_function; + dataops.repeat_weight= 0; + dataops.addseparatorwords= true; + dataops.extraheaderweight= false; + dataops.wordDelimiter= wordbreak_user; /* redbook_delimiter; */ + wordDelimiter= wordbreak_user; /* wordbreak_notgraph; */ + dataops.minwordlen= 1; + if (gDelimiters[0] == '\0') strcpy( gDelimiters, "/{}()[]%-:#.~*\";,|"); + } + else if(0 == strcmp("flybase", next_argument)){ /* dgg */ + typename = next_argument; + dataops.type = "TEXT"; + dataops.separator_function = flybase_separator_function; + dataops.header_function = flybase_header_function; + dataops.finish_header_function = flybase_finish_header_function; + dataops.repeat_weight= 0; + dataops.addseparatorwords= true; + dataops.extraheaderweight= false; + dataops.wordDelimiter= wordbreak_user; /* flybase_delimiter; */ + wordDelimiter= wordbreak_user; /* wordbreak_notgraph; */ + dataops.minwordlen= 1; + if (gDelimiters[0] == '\0') strcpy( gDelimiters, "-/{}:.~*\";,|"); + + /* flybase symbols + valid data ()$+-?;.\' + possible data and delimiter |;[]-?.~ + delimiters + solution to confusion: set 
possible delimiters as delimiters, and + permit literal searches with "..." or '...' enclosed strings. + */ + + } + else if(0 == strcmp("flystock", next_argument)){ /* dgg */ + typename = next_argument; + dataops.type = "TEXT"; + dataops.separator_function = bio_separator_function; + dataops.header_function = bio_header_function; + dataops.finish_header_function = bio_finish_header_function; + dataops.repeat_weight= 0; + dataops.addseparatorwords= true; + dataops.extraheaderweight= false; + dataops.wordDelimiter= wordbreak_user; /* flybase_delimiter; */ + wordDelimiter= wordbreak_user; /* wordbreak_notgraph; */ + dataops.minwordlen= 1; + if (gDelimiters[0] == '\0') strcpy( gDelimiters, "-/{}:.~*\";,|"); + + /* flystock symbols + valid data []()/-;?+.{} + possible data and delimiter =;. + ;. in text field is del, in data field is data + delimiters *";, + more delimiters (from matthewk) - / {} : + + solution to confusion: set possible delimiters as delimiters, and + permit literal searches with "..." or '...' enclosed strings. + ! want some way to provide field names (report "stylesheet") with + searched/fetched records for flybase, flystock, other data files + ! want "keyword [field]" limited searches for some of this to make sense ! + */ + } + + else if(0 == strcmp("din", next_argument)){ + typename = next_argument; + dataops.type = "TEXT"; + dataops.separator_function = din_separator_function; + dataops.header_function = din_header_function; + dataops.finish_header_function = din_finish_header_function; + } + + #endif + #ifdef NeXT else if(0 == strcmp("objc", next_argument)){ typename = next_argument; ! dataops.type = "TEXT"; ! dataops.separator_function = wobjc_separator_function; ! dataops.header_function = wobjc_header_function; ! dataops.finish_header_function = wobjc_finish_header_function; } #endif /* def NeXT */ else if(0 == strcmp("mail", next_argument)){ typename = next_argument; ! dataops.type = "TEXT"; ! 
dataops.separator_function = mail_separator_function; ! dataops.header_function = mail_header_function; ! dataops.date_function = mail_date_function; ! dataops.finish_header_function = mail_finish_header_function; } else if(0 == strcmp("mail_or_rmail", next_argument)){ typename = next_argument; ! dataops.type = "TEXT"; ! dataops.separator_function = mail_or_rmail_separator; ! dataops.header_function = mail_header_function; ! dataops.date_function = mail_date_function; ! dataops.finish_header_function = mail_finish_header_function; } else if(0 == strcmp("mail_digest", next_argument)){ typename = next_argument; ! dataops.type = "TEXT"; ! dataops.separator_function = mail_digest_separator_function; ! dataops.header_function = mail_header_function; ! dataops.date_function = mail_date_function; ! dataops.finish_header_function = mail_finish_header_function; } else if(0 == strcmp("mh_bboard", next_argument)){ typename = next_argument; ! dataops.type = "TEXT"; ! dataops.separator_function = mh_bboard_separator_function; ! dataops.header_function = mail_header_function; ! dataops.date_function = mail_date_function; ! dataops.finish_header_function = mail_finish_header_function; } else if(0 == strcmp("rmail", next_argument)){ typename = next_argument; ! dataops.type = "TEXT"; ! dataops.separator_function = rmail_separator_function; ! dataops.header_function = mail_header_function; ! dataops.date_function = mail_date_function; ! dataops.finish_header_function = mail_finish_header_function; } else if(0 == strcmp("netnews", next_argument)){ typename = next_argument; ! dataops.type = "TEXT"; ! dataops.separator_function = NULL; ! dataops.header_function = mail_header_function; ! dataops.date_function = mail_date_function; ! dataops.finish_header_function = mail_finish_header_function; } else if(0 == strcmp("rn", next_argument)){ typename = next_argument; ! dataops.type = "TEXT"; ! dataops.separator_function = rn_separator_function; ! 
dataops.header_function = mail_header_function; ! dataops.date_function = mail_date_function; ! dataops.finish_header_function = mail_finish_header_function; } else if(0 == strcmp("emacsinfo", next_argument)){ typename = next_argument; ! dataops.type = "TEXT"; ! dataops.separator_function = emacs_info_separator_function; ! dataops.header_function = emacs_info_header_function; ! dataops.finish_header_function = emacs_info_finish_header_function; } else if(0 == strcmp("catalog", next_argument)){ typename = next_argument; ! dataops.type = "TEXT"; ! dataops.separator_function = catalog_separator_function; ! dataops.header_function = catalog_header_function; ! dataops.finish_header_function = catalog_finish_header_function; } else if(0 == strcmp("bio", next_argument)){ typename = next_argument; ! dataops.type = "TEXT"; ! dataops.separator_function = bio_separator_function; ! dataops.header_function = bio_header_function; ! dataops.finish_header_function = bio_finish_header_function; } else if(0 == strcmp("cmapp", next_argument)){ typename = next_argument; ! dataops.type = "TEXT"; ! dataops.separator_function = cmapp_separator_function; ! dataops.header_function = cmapp_header_function; ! dataops.finish_header_function = cmapp_finish_header_function; } else if(0 == strcmp("ftp", next_argument)){ ! dataops.type = "TEXT-FTP"; typename = next_argument; ! dataops.separator_function = first_line_separator_function; ! dataops.header_function = first_line_header_function; ! dataops.finish_header_function = first_line_finish_header_function; } else if(0 == strcmp("jargon", next_argument)){ typename = next_argument; ! dataops.type = "TEXT"; ! dataops.separator_function = jargon_separator_function; ! dataops.header_function = jargon_header_function; ! dataops.finish_header_function = jargon_finish_header_function; } else if(0 == strcmp("server", next_argument)){ typename = next_argument; ! dataops.type = "WSRC"; ! 
dataops.finish_header_function = filename_finish_header_function; } else if(0 == strcmp("text", next_argument)){ ! dataops.type = "TEXT"; typename = next_argument; check_for_text_file = true; } else if(0 == strcmp("filename", next_argument)){ ! dataops.type = "TEXT"; typename = next_argument; ! dataops.finish_header_function = filename_finish_header_function; } else if(0 == strcmp("irg", next_argument)){ typename = next_argument; ! dataops.type = "TEXT"; ! dataops.separator_function = irg_separator_function; ! dataops.header_function = irg_header_function; ! dataops.finish_header_function = irg_finish_header_function; } /* dash-separated items , Intro to Algorithms buglist, etc */ else if(0 == strcmp("dash", next_argument)){ ! dataops.type = "TEXT"; typename = next_argument; ! dataops.separator_function = dash_separator_function; ! dataops.header_function = dash_header_function; ! dataops.finish_header_function = dash_finish_header_function; } /* one_line-separated items */ else if(0 == strcmp("one_line", next_argument)){ ! dataops.type = "TEXT"; typename = next_argument; ! dataops.separator_function = one_line_separator_function; ! dataops.header_function = one_line_header_function; ! dataops.finish_header_function = one_line_finish_header_function; } /* blank line-separated items (paragraphs) */ else if(0 == strcmp("para", next_argument)){ ! dataops.type = "TEXT"; typename = next_argument; ! dataops.separator_function = para_separator_function; ! dataops.header_function = para_header_function; ! dataops.finish_header_function = para_finish_header_function; } /* seeker items */ else if(0 == strcmp("seeker", next_argument)){ ! dataops.type = "TEXT"; typename = next_argument; ! dataops.separator_function = seeker_separator_function; ! dataops.header_function = seeker_header_function; ! dataops.finish_header_function = seeker_finish_header_function; } /* medline format */ else if(0 == strcmp("medline", next_argument)){ ! 
dataops.type = "TEXT"; typename = next_argument; ! dataops.separator_function = medline_separator_function; ! dataops.header_function = medline_header_function; ! dataops.finish_header_function = medline_finish_header_function; } /* refer format */ else if(0 == strcmp("refer", next_argument)){ ! dataops.type = "TEXT"; typename = next_argument; ! dataops.separator_function = refer_separator_function; ! dataops.header_function = refer_header_function; ! dataops.finish_header_function = refer_finish_header_function; } /* first_line format */ else if(0 == strcmp("first_line", next_argument)){ ! dataops.type = "TEXT"; typename = next_argument; ! dataops.separator_function = first_line_separator_function; ! dataops.header_function = first_line_header_function; ! dataops.finish_header_function = first_line_finish_header_function; } /* rlin items */ else if(0 == strcmp("rlin", next_argument)){ ! dataops.type = "TEXT"; typename = next_argument; ! dataops.separator_function = rlin_separator_function; ! dataops.header_function = rlin_header_function; ! dataops.finish_header_function = rlin_finish_header_function; } else if(0 == strcmp("dvi", next_argument)){ typename = next_argument; ! dataops.type = "DVI"; ! dataops.finish_header_function = filename_finish_header_function; } else if(0 == strcmp("ps", next_argument)){ typename = next_argument; ! dataops.type = "PS"; ! dataops.finish_header_function = filename_finish_header_function; } else if(0 == strcmp("pict", next_argument)){ typename = next_argument; ! dataops.type = "PICT"; ! dataops.finish_header_function = filename_finish_header_function; index_contents = false; } else if(0 == strcmp("gif", next_argument)){ typename = next_argument; ! dataops.type = "GIF"; ! dataops.finish_header_function = filename_finish_header_function; index_contents = false; } else if(0 == strcmp("tiff", next_argument)){ typename = next_argument; ! dataops.type = "TIFF"; ! 
dataops.finish_header_function = filename_finish_header_function; index_contents = false; } /* BibTeX items */ else if(0 == strcmp("bibtex", next_argument)){ ! dataops.type = "TEXT"; typename = next_argument; ! dataops.separator_function = bibtex_separator_function; ! dataops.header_function = bibtex_header_function; ! dataops.finish_header_function = bibtex_finish_header_function; } /* ?:? seperated hypertext items */ else if(0 == strcmp("nhyp", next_argument)){ ! dataops.type = "TEXT"; typename = next_argument; ! dataops.separator_function = nhyp_separator_function; ! dataops.header_function = nhyp_header_function; ! dataops.finish_header_function = nhyp_finish_header_function; } else if(0 == strcmp("ziff", next_argument)){ ! dataops.type = "TEXT"; typename = next_argument; ! dataops.separator_function = ziff_separator_function; ! dataops.header_function = ziff_header_function; ! dataops.finish_header_function = ziff_finish_header_function; } else{ panic("Don't recognize the '%s' type", next_argument); *************** *** 636,641 **** --- 842,852 ---- if (db == NULL) panic("unable to open the database"); } + + #ifdef BIO + write_delimiters(gDelimiters, db); + #endif + { /* set up the memory hashtable */ if(memory_to_use < 0){ /* default */ *************** *** 677,683 **** while(NULL != next_argument){ /* the first filename is in next_argument already */ if(directoryp(next_argument)){ if(traverse_directory){ ! index_directory(next_argument, separator_function, header_function, date_function, --- 888,898 ---- while(NULL != next_argument){ /* the first filename is in next_argument already */ if(directoryp(next_argument)){ if(traverse_directory){ ! index_directory(next_argument, &dataops, db, ! check_for_text_file, ! adding_to_existing_index, ! word_positions, word_pairs); ! /* index_directory(next_argument, separator_function, header_function, date_function, *************** *** 685,697 **** type, db, check_for_text_file, adding_to_existing_index, ! 
word_positions, word_pairs); } } else{ /* not a directory */ waislog(WLOG_MEDIUM, WLOG_INDEX, "Indexing file: %s", next_argument); ! index_text_file(next_argument, separator_function, header_function, date_function, --- 900,915 ---- type, db, check_for_text_file, adding_to_existing_index, ! word_positions, word_pairs, minwordlen); */ } } else{ /* not a directory */ waislog(WLOG_MEDIUM, WLOG_INDEX, "Indexing file: %s", next_argument); ! index_text_file(next_argument, &dataops, db, ! check_for_text_file, adding_to_existing_index, ! word_positions, word_pairs); ! /* index_text_file(next_argument, separator_function, header_function, date_function, *************** *** 698,704 **** finish_header_function, type, db, check_for_text_file, adding_to_existing_index, ! word_positions, word_pairs); } if (read_files_from_stdin) { if (0 != (next_argument = fgets(data_filename, MAXPATHLEN, stdin))) { --- 916,922 ---- finish_header_function, type, db, check_for_text_file, adding_to_existing_index, ! word_positions, word_pairs, minwordlen); */ } if (read_files_from_stdin) { if (0 != (next_argument = fgets(data_filename, MAXPATHLEN, stdin))) { diff -bwcr wais-8-b5/ir/ircfiles.c w8b5bio/ir/ircfiles.c *** wais-8-b5/ir/ircfiles.c Wed May 6 19:27:43 1992 --- w8b5bio/ir/ircfiles.c Sun Nov 8 19:09:51 1992 *************** *** 104,109 **** --- 104,113 ---- #define MAX_AUTHOR_LEN 25 #define MAX_DATE_LEN 4 + static char *months[] = {"Jan", "Feb", "Mar", "Apr", "May", "Jun", + "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", NULL}; + + static char* trim_trailing_newline _AP((char* string)); static char* *************** *** 117,122 **** --- 121,1328 ---- return(string); } + + #ifdef BIO + + char bio_header1[MAX_HEADER_LEN + 1]; + char bio_header2[MAX_HEADER_LEN + 1]; + + + /* ========================================== + * + * === Genbank Flat-file Customizations === + * + * d.g.gilbert, 15feb92, + * gilbertd@bio.indiana.edu + * + * ========================================== + */ + + #define 
genbank_data_tab 12 + #define genbank_date_tab 63 + + /* Genbank Flat-file format: + LOCUS ACAAC01 1571 bp ds-DNA INV 05-NOV-1991 << start entry + 12345678901234567890123456789012345678901234567890123456789012345678901234567890 + .........1.........2.........3.........4.........5.........6.........7.........8 + all data starts at tab=13 + on locus line, data starts at tab=63 + ... + LOCUS blah << Start entry, index LOCUS_NAME, includes DATE + DEFINITION blah << Index def line == HEADER line + ACCESSION blah << Index acc line + KEYWORDS blah << index keywords + SOURCE blah << index source + ORGANISM blah << index organism + blah << index taxonomy + blah << " + AUTHORS blah << Index + TITLE blah << Index + blah << Index + ANYOTHERS jazz << skipit + // << end of entry == entry separator + LOCUS ACAAC01 1571 bp ds-DNA INV 05-NOV-1991 << start entry + DEFINITION Acanthamoeba castelani gene encoding actin I. + ACCESSION V00002 J01016 + KEYWORDS actin. + SOURCE Acanthamoeba castellanii DNA. + ORGANISM Acanthamoeba castellanii + Eukaryota; Animalia; Protozoa; Sarcomastigophora; Sarcodina; + Rhizopoda; Lobosa; Gymnamoeba; Amoebida; Acanthopodina; + Acanthamoebidae. + REFERENCE 1 (bases 1 to 1571) + AUTHORS Nellen,W. and Gallwitz,D. + TITLE Actin genes and actin messenger RNA in Acanthamoeba castellani. + Nucleotide sequence of the split actin gene I + JOURNAL J. Mol. Biol. 159, 1-18 (1982) + COMMENT SWISS-PROT; P02578; ACT1$ACACA. + From EMBL 26 entry ACAC01; dated 22-JAN-1991. + + FEATURES Location/Qualifiers + >>> ignore all features + + BASE COUNT 313 a 535 c 389 g 334 t + ORIGIN + 1 ggagaagcgt gcacgcaata accaagcgac agagcaacct ctctggcacc acgccccaca + >>> ignore all seq data in indexing + // <<< end of entry + LOCUS ACAMHCA 5894 bp ds-DNA INV 30-SEP-1988 + ... 
+ *****/ + + + static boolean keepindexing = false; + + void genbank_filter_for_index(line) + char* line; + { + /* check whether to index anything in line, + * call this from genbank_header_function which is called for + * each line. + * Blank out parts of line not for indexing... + */ + char *c; + long i; + + if (strlen(line) <= genbank_data_tab) { + for (c=line ; *c>=' '; c++) *c=' '; + keepindexing= false; + } + + else if (substrcmp(line, " ")) { + /* most lines are like this, including nucleotides */ + if (!keepindexing) for (c=line ; *c>=' '; c++) *c=' '; + } + + /******* + else if (substrcmp(line, "LOCUS ")){ + // I think this is bad, locus not in index ... + for (c=line, i=0; *c>=' ' && i' '; c++) ; // leave LOCUS ID intact + for ( ; *c>=' '; c++) *c=' '; + keepindexing= false; + } + ******/ + + else if ( + substrcmp(line, "DEFINITION ") + || substrcmp(line, "LOCUS ") + || substrcmp(line, "ACCESSION ") + || substrcmp(line, "KEYWORDS ") + || substrcmp(line, "SOURCE ") + || substrcmp(line, " ORGANISM ") + || substrcmp(line, " AUTHORS ") + || substrcmp(line, " TITLE ") + ){ + for (c=line, i=0; *c>=' ' && i=' '; c++) *c=' '; + keepindexing= false; + } + } + + + + boolean genbank_separator_function(line) + char *line; + { + /* !! with // as separator, we get // at top of entry which will + screw up seqanal software... 
*/ + /* if ((strlen(line) > 1) && (0==strncmp(line, "//", 2))){ + return(true); + } + */ + if ((strlen(line) > genbank_data_tab) && substrcmp(line, "LOCUS ")){ + return(true); + } + else{ + return(false); + } + } + + long genbank_getdate(line) + char *line; + { + /* genbank date == 30-SEP-1988; returns YYMMDD as a long, or 0 when no valid date is found */ + char date[255], *temp; + int day = 0, month, year = 0; /* stay 0 when sscanf cannot convert them */ + char cmonth[25]; + + strncpy(date, line, sizeof(date) - 1); date[sizeof(date) - 1] = '\0'; /* bounded copy, always terminated */ + + temp = date; cmonth[0] = '\0'; + + while(*temp != '\0' && !isdigit((unsigned char)*temp)) temp++; /* stop at end of string if there is no digit */ + + /* sscanf(temp, "%d %s %d", &day, cmonth, &year); */ + sscanf(temp, "%d-%24[^-]-%d", &day, cmonth, &year); /* "%s" would swallow "SEP-1988" whole and leave year unset */ + + for(month = 0; months[month] != NULL; month++) + /* if(!strcmp(cmonth, months[month])) break; */ + if(!strcasecmp(cmonth, months[month])) break; /* was stricmp !! */ + + if (year > 99) year = year % 100; + + if(day > 0 && + month < 12 && + year > 0) { + return (10000 * year + 100 * (month+1) + day); + } + return 0; + } + + long genbank_date_function(line) + char *line; + { + if ((strlen(line) > genbank_date_tab) && substrcmp(line, "LOCUS ")){ /* line must reach the date column before we index into it */ + return(genbank_getdate(line+genbank_date_tab)); + } + else + return -1; + } + + + + char *genbank_def = bio_header1; + char *genbank_accession= bio_header2; + + void genbank_header_function(line) + char *line; + { + if ((strlen(line) > genbank_data_tab) && substrcmp(line, "DEFINITION ") && + (strlen(genbank_def) == 0)){ + strncpy(genbank_def, line + genbank_data_tab, MAX_HEADER_LEN); + trim_trailing_newline(genbank_def); + } + + else if ((strlen(line) > genbank_data_tab) && + substrcmp(line, "ACCESSION ") && + (strlen(genbank_accession) == 0)){ + /* cut extra acc. 
numbers from this -- we want only 1st */ + char *cp; + for (cp=line+genbank_data_tab; *cp==' '; cp++) ; + strncpy(genbank_accession, cp, MAX_HEADER_LEN); + cp= strchr(genbank_accession, ' '); + if (cp!=NULL) *cp=0; /* drop after 1st */ + trim_trailing_newline(genbank_accession); + } + + genbank_filter_for_index( line); + + } + + void genbank_finish_header_function(header) + char *header; + { + if(strlen(genbank_def) != 0 && strlen(genbank_accession) != 0){ + strncpy(header, genbank_accession, MAX_HEADER_LEN); + s_strncat(header, " ", MAX_HEADER_LEN, MAX_HEADER_LEN); + s_strncat(header, genbank_def, MAX_HEADER_LEN, MAX_HEADER_LEN); + } + else if(strlen(genbank_def) != 0){ + strncpy(header, genbank_def, MAX_HEADER_LEN); + } + else if(strlen(genbank_accession) != 0){ + strncpy(header, genbank_accession, MAX_HEADER_LEN); + } + else{ + strcpy(header, "Unknown Entry"); + } + genbank_def[0] = '\0'; + genbank_accession[0] = '\0'; + } + + + /* ========================================== + * + * === PIR Protein Customizations === + * + * d.g.gilbert, 11Mar92, + * gilbertd@bio.indiana.edu + * + * ========================================== + */ + + #define pir_data_tab 16 + + /* pir Flat-file format: + ENTRY CCHU #Type Protein + 12345678901234567890123456789012345678901234567890123456789012345678901234567890 + .........1.........2.........3.........4.........5.........6.........7.........8 + all data starts at tab=17 or further + + ENTRY blah << Start entry, index it + TITLE blah << Index def line == HEADER line + ACCESSION blah << Index acc line + KEYWORDS blah << index keywords + SOURCE blah << index source + REFERENCE blah << Index + SUPERFAMILY blah << Index + blah << Index + ANYOTHERS jazz << skipit + any word starting w/ "#", skipit + /// << end of entry == entry separator + ENTRY CCHU #Type Protein + TITLE Cytochrome c - Human + DATE #Sequence 30-Sep-1991 #Text 30-Sep-1991 + PLACEMENT 1.0 1.0 1.0 1.0 1.0 + SOURCE Homo sapiens #Common-name man + ACCESSION A31764\ A05676\ 
A00001 + REFERENCE + #Authors Evans M.J., Scarpulla R.C. + #Journal Proc. Natl. Acad. Sci. U.S.A. (1988) 85:9625-9629 + #Title The human somatic cytochrome c gene: two classes of + processed pseudogenes demarcate a period of rapid + molecular evolution. + #Reference-number A31764 + #Accession A31764 + #Molecule-type DNA + #Residues 1-105 + #Cross-reference GB:M22877 + REFERENCE + #Authors Matsubara H., Smith E.L. + #Journal J. Biol. Chem. (1963) 238:2732-2753 + #Reference-number A05676 + #Accession A05676 + #Molecule-type protein + #Residues 2-28;29-46;47-100;101-105 + REFERENCE + #Authors Matsubara H., Smith E.L. + #Journal J. Biol. Chem. (1962) 237:3575-3576 + #Reference-number A00001 + #Comment 66-Leu is found in 10% of the molecules in pooled + protein. + GENETIC + #Introns 57/1 + SUPERFAMILY #Name cytochrome c + KEYWORDS acetylation\ electron transport\ heme\ + mitochondrion\ oxidative phosphorylation\ + polymorphism\ respiratory chain + FEATURE + 2-105 #Protein cytochrome c (experimental) + \ + 2 #Modified-site acetylated amino end + (experimental)\ + 15,18 #Binding-site heme (covalent)\ + 19,81 #Binding-site heme iron (axial ligands) + SUMMARY #Molecular-weight 11749 #Length 105 #Checksum 3247 + SEQUENCE + 5 10 15 20 25 30 + 1 M G D V E K G K K I F I M K C S Q C H T V E K G G K H K T G + 31 P N L H G L F G R K T G Q A P G Y S Y T A A N K N K G I I W + 61 G E D T L M E Y L E N P K K Y I P G T K M I F V G I K K K E + 91 E R A D L I A Y L K K A T N E + /// + + *****/ + + + void pir_filter_for_index(line) + char* line; + { + /* check whether to index anything in line, + * call this from pir_header_function which is called for + * each line. + * Blank out parts of line not for indexing... + */ + char *c; + long i; + + if (strlen(line) <= pir_data_tab) { + for (c=line ; *c>=' '; c++) *c=' '; + keepindexing= false; + } + + /* drop some ref junk that is not of much indexing interest... 
*/ + else if (substrcmp(line, " #Reference-number ") + || substrcmp(line, " #Residues ") + || substrcmp(line, " #Accession ") + || substrcmp(line, " #Residues ") + || substrcmp(line, " #Cross-reference ") + || substrcmp(line, " #Molecule-type ") + || substrcmp(line, " #Journal ") ) { + for (c=line ; *c>=' '; c++) *c=' '; + /* keepindexing is based on last main keyword (ENTRY, REF...) */ + } + + else if (substrcmp(line, " ")) { + /* some good & bad continuation lines start like this */ + if (!keepindexing) for (c=line ; *c>=' '; c++) *c=' '; + } + + else if ( + substrcmp(line, "ENTRY ") + || substrcmp(line, "TITLE ") + || substrcmp(line, "SOURCE ") + || substrcmp(line, "ACCESSION ") + || substrcmp(line, "REFERENCE") + /* REFERENCE line seems to have no data on line, but it follows (keepindexing) */ + || substrcmp(line, "SUPERFAMILY ") + || substrcmp(line, "KEYWORDS ") + ){ + for (c=line, i=0; *c>=' ' && i=' '; c++) *c=' '; + keepindexing= false; + } + + /* pir -- blank out #words */ + for (c=line; *c != 0; ) { + if (*c=='#') do { *c++=' '; } while (*c > ' '); + else c++; + } + + } + + + + boolean pir_separator_function(line) + char *line; + { + /* !! with /// as separator, we get /// at top of entry which will + screw up seqanal software... 
*/ + /* if ((strlen(line) > 1) && (0==strncmp(line, "///", 2))){ + return(true); + } + */ + if ((strlen(line) > pir_data_tab) && substrcmp(line, "ENTRY ")){ + return(true); + } + else{ + return(false); + } + } + + + long pir_date_function(line) + char *line; + { /* later maybe */ + return -1; + } + + + + char *pir_def = bio_header1; + char *pir_accession= bio_header2; + + void pir_header_function(line) + char *line; + { + if ((strlen(line) > pir_data_tab) && + substrcmp(line, "TITLE ") && + (strlen(pir_def) == 0)){ + strncpy(pir_def, line + pir_data_tab, MAX_HEADER_LEN); + trim_trailing_newline(pir_def); + } + + else if ((strlen(line) > pir_data_tab) && + substrcmp(line, "ACCESSION ") && + (strlen(pir_accession) == 0)){ + /* cut extra acc. numbers from this -- we want only 1st */ + char *cp; + for (cp=line+pir_data_tab; *cp==' '; cp++) ; + strncpy(pir_accession, cp, MAX_HEADER_LEN); + cp= strchr(pir_accession, ' '); + if (cp!=NULL) *cp=0; /* drop after 1st */ + trim_trailing_newline(pir_accession); + } + + pir_filter_for_index( line); + + } + + void pir_finish_header_function(header) + char *header; + { + if(strlen(pir_def) != 0 && strlen(pir_accession) != 0){ + strncpy(header, pir_accession, MAX_HEADER_LEN); + s_strncat(header, " ", MAX_HEADER_LEN, MAX_HEADER_LEN); + s_strncat(header, pir_def, MAX_HEADER_LEN, MAX_HEADER_LEN); + } + else if(strlen(pir_def) != 0){ + strncpy(header, pir_def, MAX_HEADER_LEN); + } + else if(strlen(pir_accession) != 0){ + strncpy(header, pir_accession, MAX_HEADER_LEN); + } + else{ + strcpy(header, "Unknown Entry"); + } + pir_def[0] = '\0'; + pir_accession[0] = '\0'; + } + + + + /* ========================================== + * === EMBL Flat-file Customizations === + * d.g.gilbert, 23Feb92, + * ========================================== + */ + + #define embl_data_tab 5 + + /* EMBL Flat-file format: + + ID BAAMYLA standard; DNA; PRO; 7872 BP. + 1234567890 + XX + AC X62835; + XX + DT 12-NOV-1991 (Rel. 
29, Last updated, Version 1) + DT 12-NOV-1991 (Rel. 29, Created) + XX + DE B.acidocaldarius amy gene for amylase + XX + KW amy gene; amylase. + XX + OS Bacillus acidocaldarius + OC Prokaryota; Bacteria; Firmicutes; Endospore-forming rods and cocci; + OC Bacillaceae; Bacillus. + XX + RN [1] + RP 1-7872 + RA Hemila H.O.; + RT ; + RL Submitted (22-OCT-1991) on tape to the EMBL Data Library by: + RL H.O. Hemila, Institute of Biotechnology, Valimotie 7, 00380 + RL Helsinki, FINLAND + XX + RN [2] + RP 1-7872 + RA Koivula T., Hemilae H.; + RT ; + RL Unpublished. + XX + CC *source: strain=ATCC 27009; + CC *source: clone_library=lambda gt-10; + XX + FH Key Location/Qualifiers + FH + FT -35_signal 3224..3229 + FT -10_signal 3246..3251 + FT RBS 3288..3294 + FT /note="amy gene" + FT CDS 3297..7202 + FT /gene="amy" /product="amylase" + FT CDS 7332..>7872 + FT /product="malE protein-homologue" + XX + SQ Sequence 7872 BP; 1615 A; 2240 C; 2473 G; 1544 T; 0 other; + cgttcctcgt gccgtccgaa gcgttcccga cgaatctgcg cggcaccgcc gcgggatctc + // + *****/ + + + + void embl_filter_for_index(line) + char* line; + { + /* check whether to index anything in line, + * call this from embl_header_function which is called for + * each line. + * Blank out parts of line not for indexing... + */ + char *c; + long i; + + if (strlen(line) <= embl_data_tab) { + for (c=line; *c>=' '; c++) *c=' '; + } + + else if ( + substrcmp(line, "DE ") + || substrcmp(line, "ID ") + || substrcmp(line, "AC ") + || substrcmp(line, "KW ") + || substrcmp(line, "OS ") + || substrcmp(line, "OC ") + || substrcmp(line, "RA ") + || substrcmp(line, "RT ") + ){ + for (c=line, i=0; *c>=' ' && i=' '; c++) *c=' '; + } + } + + + + boolean embl_separator_function(line) + char *line; + { + /* !! with // as separator, we get // at top of entry which will + screw up seqanal software... 
*/ + /* if ((strlen(line) > 1) && (0==strncmp(line, "//", 2))){ + return(true); + } + */ + if ((strlen(line) > embl_data_tab) && substrcmp(line, "ID ")){ + return(true); + } + else{ + return(false); + } + } + + /* embl date == 30-SEP-1988 == genbank_date*/ + + long embl_date_function(line) + char *line; + { + if ((strlen(line) > embl_data_tab) && substrcmp(line, "DT ")){ + return(genbank_getdate(line+embl_data_tab)); + } + else + return -1; + } + + + + char *embl_def = bio_header1; + char *embl_accession= bio_header2; + + void embl_header_function(line) + char *line; + { + if ((strlen(line) > embl_data_tab) && + substrcmp(line, "DE ") && + (strlen(embl_def) == 0)){ + strncpy(embl_def, line + embl_data_tab, MAX_HEADER_LEN); + trim_trailing_newline(embl_def); + } + + else if ((strlen(line) > embl_data_tab) && + substrcmp(line, "AC ") && + (strlen(embl_accession) == 0)){ + /* cut extra acc. numbers from this -- we want only 1st */ + char *cp; + for (cp=line+embl_data_tab; *cp==' '; cp++) ; + strncpy(embl_accession, cp, MAX_HEADER_LEN); + cp= strchr(embl_accession, ' '); + if (cp!=NULL) *cp=0; /* drop after 1st */ + trim_trailing_newline(embl_accession); + } + + embl_filter_for_index( line); + + } + + void embl_finish_header_function(header) + char *header; + { + if(strlen(embl_def) != 0 && strlen(embl_accession) != 0){ + strncpy(header, embl_accession, MAX_HEADER_LEN); + s_strncat(header, " ", MAX_HEADER_LEN, MAX_HEADER_LEN); + s_strncat(header, embl_def, MAX_HEADER_LEN, MAX_HEADER_LEN); + } + else if(strlen(embl_def) != 0){ + strncpy(header, embl_def, MAX_HEADER_LEN); + } + else if(strlen(embl_accession) != 0){ + strncpy(header, embl_accession, MAX_HEADER_LEN); + } + else{ + strcpy(header, "Unknown Entry"); + } + embl_def[0] = '\0'; + embl_accession[0] = '\0'; + } + + + + /* ========================================== + * + * === Prosite Dat & Doc Customizations === + * + * d.g.gilbert, 18feb92, + * gilbertd@bio.indiana.edu + * + * 
========================================== + */ + + #define prositedat_data_tab 5 + + /* Prosite DOC format: + + {END} + {PDOC00002} + {PS00002; GLYCOSAMINOGLYCAN} + {BEGIN} + ************************************* + * Glycosaminoglycan attachment site * + ************************************* + + Proteoglycans [1] are complex glycoconjugates consisting of a core protein to + which a variable number of glycosaminoglycan chains (such as heparin sulfate, + chondroitin sulfate, etc.) are covalently attached. The glycosaminoglycans are + attached to the core proteins through a xyloside residue which is in turn is + linked to a serine residue of the protein. A consensus sequence for the + attachment site seems to exist [2]. However, it must be noted that this + consensus is only based on the sequence of three proteoglycans core proteins. + + -Consensus pattern: S-G-x-G + [S is the attachment site] + Additional rule: There must be at least two acidic amino acids from -2 to -4 + relative to the serine. + -Last update: June 1988 / First entry. + + [ 1] Hassel J.R., Kimura J.H., Hascall V.C. + Annu. Rev. Biochem. 55:539-567(1986). + [ 2] Bourdon M.A., Krusius T., Campbell S., Schwarz N.B. + Proc. Natl. Acad. Sci. U.S.A. 84:3194-3198(1987). + {END} + {PDOC00003} + {PS00003; SULFATATION} + {BEGIN} + + *****/ + + /* Prosite DAT format: + // + ID ASN_GLYCOSYLATION; PATTERN. + 1234567890 + AC PS00001; + DT APR-1990 (CREATED); APR-1990 (DATA UPDATE); APR-1990 (INFO UPDATE). + DE N-glycosylation site. + PA N-{P}-[ST]-{P}. 
+ CC /TAXO-RANGE=??E?V; + CC /SITE=1,carbohydrate; + CC /SKIP-FLAG=TRUE; + DO PDOC00001; + // + + *****/ + + boolean prositedoc_separator_function(line) + char *line; + { + if ((strlen(line) > strlen("{END}")) && substrcmp(line, "{END}")){ + return(true); + } + else{ + return(false); + } + } + + + char *prositedoc_def = bio_header1; + char *prositedoc_accession= bio_header2; + + void prositedoc_header_function(line) + char *line; + { + if ((strlen(line)>2) && (line[0]=='*') && (line[1]==' ') && + (strlen(prositedoc_def) == 0)){ + strncpy(prositedoc_def, line + 2, MAX_HEADER_LEN); + trim_trailing_newline(prositedoc_def); + } + else if ((strlen(line)>2) && (line[0]=='{') && + (!substrcmp(line, "{END}")) && + (strlen(prositedoc_accession) == 0)){ + char *cp; + strncpy(prositedoc_accession, line+1, MAX_HEADER_LEN); + cp= strchr(prositedoc_accession, '}'); + if (cp!=NULL) *cp=0; + trim_trailing_newline(prositedoc_accession); + } + + } + + void prositedoc_finish_header_function(header) + char *header; + { + if(strlen(prositedoc_def) != 0 && strlen(prositedoc_accession) != 0){ + strncpy(header, prositedoc_accession, MAX_HEADER_LEN); + s_strncat(header, " ", MAX_HEADER_LEN, MAX_HEADER_LEN); + s_strncat(header, prositedoc_def, MAX_HEADER_LEN, MAX_HEADER_LEN); + } + else if(strlen(prositedoc_def) != 0){ + strncpy(header, prositedoc_def, MAX_HEADER_LEN); + } + else if(strlen(prositedoc_accession) != 0){ + strncpy(header, prositedoc_accession, MAX_HEADER_LEN); + } + else{ + strcpy(header, "Unknown Entry"); + } + prositedoc_def[0] = '\0'; + prositedoc_accession[0] = '\0'; + } + + + boolean prositedat_separator_function(line) + char *line; + { + /* !! with // as separator, we get // at top of entry which will + screw up seqanal software... 
*/ + /* if ((strlen(line) > 1) && (0==strncmp(line, "//", 2))){ + return(true); + } + */ + if ((strlen(line) > prositedat_data_tab) && substrcmp(line, "ID ")){ + return(true); + } + else{ + return(false); + } + } + + + char *prositedat_def = bio_header1; + char *prositedat_accession= bio_header2; + + void prositedat_header_function(line) + char *line; + { + int i; + + if ((strlen(line) > prositedat_data_tab) && + substrcmp(line, "DE ") && + (strlen(prositedat_def) == 0)){ + strncpy(prositedat_def, line + prositedat_data_tab, MAX_HEADER_LEN); + trim_trailing_newline(prositedat_def); + } + + else if ((strlen(line) > prositedat_data_tab) && + substrcmp(line, "AC ") && + (strlen(prositedat_accession) == 0)){ + /* cut extra acc. numbers from this -- we want only 1st */ + char *cp; + for (cp=line+prositedat_data_tab; *cp==' '; cp++) ; + strncpy(prositedat_accession, cp, MAX_HEADER_LEN); + cp= strchr(prositedat_accession, ' '); + if (cp!=NULL) *cp=0; /* drop after 1st */ + trim_trailing_newline(prositedat_accession); + } + + if (strlen(line) > prositedat_data_tab) + for (i=0; i 1) && (0==strncmp(line, "//", 2))){ + return(true); + } + /* if ((strlen(line) > biojournal_tab) && substrcmp(line, "RA ")){ + return(true); + } + */ + else{ + return(false); + } + } + + + char *biojournal_title = bio_header1; + char *biojournal_author= bio_header2; + + void biojournal_header_function(line) + char *line; + { + int i; + + if ((strlen(line) > biojournal_tab) && substrcmp(line, "RT ") && + (strlen(biojournal_title) == 0)){ + strncpy(biojournal_title, line + biojournal_tab, MAX_HEADER_LEN); + trim_trailing_newline(biojournal_title); + } + + else if ((strlen(line) > biojournal_tab) && substrcmp(line, "RA ") && + (strlen(biojournal_author) == 0)){ + char *cp; + strncpy(biojournal_author, line+biojournal_tab, MAX_HEADER_LEN); + cp= strchr(biojournal_author, ' '); + if (cp!=NULL) *cp=0; /* drop after 1st */ + trim_trailing_newline(biojournal_author); + } + + if (strlen(line) > 
biojournal_tab) + for (i=0; i= 3) && substrcmp(line, "***")) { + return(true); + } + else{ + return(false); + } + } + + + void din_header_function(line) + char *line; + { + if(din_hit_head /* we just hit a seperator previous to this */ + && strlen(line) > 3 /* line is valid */ + && isalnum(*line) /* and is word */ + && (!din_separator_function(line)) /* we are not on the separator now */ + && strlen(din_header) == 0){ /* and we have not saved the headline yet */ + strcpy(din_header, line); + waislog(WLOG_MEDIUM, WLOG_INDEX, "storing line: %s", din_header); + din_hit_head = false; + } + } + + void din_finish_header_function(header) + char *header; + { + din_hit_head = true; /* turn on the flag */ + if(strlen(din_header) == 0){ + strcpy(header, "Unknown Title"); + } + else{ + strcpy(header, din_header); + } + din_header[0] = '\0'; + } + + + + #endif /* BIO */ + + /* ================================= * === Groliers Customizations === * ================================= *************** *** 196,203 **** I'm open to better code. - Jonny G */ - static char *months[] = {"Jan", "Feb", "Mar", "Apr", "May", "Jun", - "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", NULL}; long my_getdate(line) char *line; --- 1402,1407 ---- *************** *** 888,895 **** --- 2092,2118 ---- * just like 'mail'. * * wollman@uvm.edu, Sun Sep 8 20:12:21 EDT 1991 + * + * dgg added "Path:" fix for netnews/NNTP fetches (!NOT MAIL, NO "From ") + * gilbertd@sunflower.bio.indiana.edu */ boolean rn_separator_function(line) + char *line; + { + if(!strncmp(line,"From ",5) || + !strncmp(line,"Path: ",6) || + !strncmp(line,"Article ",7) || + !strncmp(line,"Article: ",9)) + return true; + return false; + } + + /* + * Customization for files saved NNTP netnews fetches (!NOT MAIL FORMAT, NO "From ". 
+ * + * gilbertd@sunflower.bio.indiana.edu + */ + boolean netnews_separator_function(line) char *line; { if(!strncmp(line,"From ",5) || diff -bwcr wais-8-b5/ir/ircfiles.h w8b5bio/ir/ircfiles.h *** wais-8-b5/ir/ircfiles.h Wed May 6 19:29:27 1992 --- w8b5bio/ir/ircfiles.h Sun Nov 8 19:10:49 1992 *************** *** 32,37 **** --- 32,86 ---- { #endif /* def __cplusplus */ + #ifdef BIO + /* genbank flat files -- dgg*/ + boolean genbank_separator_function _AP((char *line)); + void genbank_header_function _AP((char *line)); + void genbank_finish_header_function _AP((char *header)); + long genbank_date_function _AP((char *line)); + + /* embl flat files -- dgg*/ + boolean embl_separator_function _AP((char *line)); + void embl_header_function _AP((char *line)); + void embl_finish_header_function _AP((char *header)); + long embl_date_function _AP((char *line)); + + /* pir flat files -- dgg*/ + boolean pir_separator_function _AP((char *line)); + void pir_header_function _AP((char *line)); + void pir_finish_header_function _AP((char *header)); + long pir_date_function _AP((char *line)); + + /* prosite protein doc & dat files - dgg */ + boolean prositedoc_separator_function _AP((char *line)); + void prositedoc_header_function _AP((char *line)); + void prositedoc_finish_header_function _AP((char *header)); + + boolean prositedat_separator_function _AP((char *line)); + void prositedat_header_function _AP((char *line)); + void prositedat_finish_header_function _AP((char *header)); + + /* Bionet.journals table of contents format */ + boolean biojournal_separator_function _AP((char *line)); + void biojournal_header_function _AP((char *line)); + void biojournal_finish_header_function _AP((char *header)); + + /* drosophila redbook -dgg */ + boolean redbook_separator_function _AP((char *line)); + void redbook_header_function _AP((char *line)); + void redbook_finish_header_function _AP((char *header)); + + /* drosophila flybase -dgg */ + boolean flybase_separator_function _AP((char 
*line)); + void flybase_header_function _AP((char *line)); + void flybase_finish_header_function _AP((char *header)); + + boolean din_separator_function _AP((char *line)); + void din_finish_header_function _AP((char *header)); + void din_header_function _AP((char *line)); + + #endif /* BIO */ + /* groliers encyclopedia */ boolean groliers_separator_function _AP((char *line)); void groliers_header_function _AP((char *line)); Only in w8b5bio/ir: irext.h.O diff -bwcr wais-8-b5/ir/irfiles.c w8b5bio/ir/irfiles.c *** wais-8-b5/ir/irfiles.c Thu Apr 30 17:11:18 1992 --- w8b5bio/ir/irfiles.c Sat Oct 31 19:05:43 1992 *************** *** 138,143 **** --- 138,146 ---- #define FILENAME_TABLE_HEADER_SIZE 4 #define HEADLINE_TABLE_HEADER_SIZE 4 + #ifdef BIO + #define DELIMITERS_SIZE 4 + #endif #define FILE_WRITE_DATE_SIZE 4 #define NUMBER_OF_OCCURANCES_SIZE 4 *************** *** 315,320 **** --- 318,335 ---- disposeDatabase(db); return(NULL); } + + #ifdef BIO + db->delimiters_stream = + s_fopen(delimiters_filename(file, db),open_mode); + if (db->delimiters_stream == NULL){ + waislog(WLOG_HIGH, WLOG_ERROR, + "can't open the delimiters file %s, using defaults", file); + /* disposeDatabase(db); */ + /* return(NULL); */ + } + #endif + db->document_table_stream = s_fopen(document_table_filename(file, db),open_mode); if (db->document_table_stream == NULL){ *************** *** 458,463 **** --- 473,483 ---- db->headline_table_stream = initialize_file(HEADLINE_TABLE_HEADER_SIZE, headline_table_filename(file, db), TRUE); + #ifdef BIO + db->delimiters_stream = + initialize_file(DELIMITERS_SIZE, + delimiters_filename(file, db), TRUE); + #endif } /* ========================= */ *************** *** 571,588 **** } #endif static long find_pointer_in_block _AP((char* word,unsigned char* block, long block_length, ! long *position )); /* Courtesy of Simon Spero */ ! 
static long find_pointer_in_block(word,block,block_length, position) char *word; unsigned char *block; long block_length; /* in entries */ long *position; /* returns 0 if an error or if the word is below the lowest block, (this confusion between error and NULL is bad, but found late in the design process) --- 591,645 ---- } #endif + + #ifdef PARTIALWORD + + typedef struct { + long blocknum, wordcount; + } saveparttype; + + static long gMaxpart = 0; + static long gNpart = 0; + static long gAtpart = 0; + static saveparttype *gSavepart = NULL; + + void clearPartMatch() + { + if (gSavepart!=NULL) free(gSavepart); + gSavepart= NULL; + gMaxpart= 0; + gNpart= 0; + gAtpart= 0; + } + + void savePartMatch( blocknum, wordcount) + long blocknum, wordcount; + { + if (gNpart>=gMaxpart) { + gMaxpart= gNpart + 100; + if (gSavepart==NULL) (void*) gSavepart= malloc(gMaxpart*sizeof(saveparttype)); + else (void*) gSavepart= realloc(gSavepart, gMaxpart*sizeof(saveparttype)); + } + gSavepart[gNpart].blocknum= blocknum; + gSavepart[gNpart].wordcount= wordcount; + gNpart++; + } + #endif + static long find_pointer_in_block _AP((char* word,unsigned char* block, long block_length, ! long *position, ! boolean findpart )); /* Courtesy of Simon Spero */ ! static long find_pointer_in_block(word,block,block_length, position, findpart) char *word; unsigned char *block; long block_length; /* in entries */ long *position; + boolean findpart; /* dgg, partial word match */ /* returns 0 if an error or if the word is below the lowest block, (this confusion between error and NULL is bad, but found late in the design process) *************** *** 597,602 **** --- 654,663 ---- returns 0 if not found. 
*/ /* this could be binary search XXX */ long i,high,low,tmp; + #ifdef PARTIALWORD + long wordlen= strlen(word); + #endif + low = 0; high = block_length; i = (low+high)/2; *************** *** 616,622 **** --- 677,714 ---- return(- dictionary_block_position(i-1,block)); } } else { + #ifdef PARTIALWORD + if (findpart) { + compare = strncmp(dictionary_word, word, wordlen); + if ((0 == compare) /*&& (wordlen < strlen(dictionary_word))*/ ) { + int ati = i; + /* save partword matches for later... */ + savePartMatch( dictionary_block_position(i, block), + dictionary_block_word_occurances(i,block)); + while (i>low && 0 == compare) { + --i; + dictionary_word = dictionary_block_word(i, block); + compare = strncmp(dictionary_word, word, wordlen); + if (0 == compare) savePartMatch( dictionary_block_position(i, block), + dictionary_block_word_occurances(i,block)); + } + i= ati; + while (i 0) return (answer); /* got a match */ + } + + if (gAtpart >= gNpart) { + clearPartMatch(); + return(-1); + } + else { + answer= gSavepart[gAtpart].blocknum; + if (NULL != number_of_occurances) { + if (answer > 0) *number_of_occurances = gSavepart[gAtpart].wordcount; + else *number_of_occurances = 0; + } + gAtpart++; + return( answer); + } + } + #endif + + long look_up_word_in_dictionary(word, number_of_occurances, db) char *word; long *number_of_occurances; *************** *** 713,719 **** --- 839,856 ---- long answer; FILE *stream = db->dictionary_stream; long dictionary_block_pos; + boolean findpart = false; /* dgg, PARTIALWORD flag */ + #ifdef PARTIALWORD + { + int l = strlen(word) - 1; + if (l > 0 && word[l] == PARTWORD_WILDCARD) { + findpart= true; + word[l]= '\0'; + } + } + #endif + if(NULL == dictionary_header_block) { s_fseek(stream, 0L, SEEK_SET); *************** *** 733,739 **** find_pointer_in_block(word, dictionary_header_block, number_of_dictionary_blocks, ! 
&position); if(0 == dictionary_block_pos) { /* waislog(WLOG_HIGH, WLOG_ERROR, "Could not find pointer for word '%s' (location %ld) in block in db %s!", word, word, db->database_file); */ --- 870,876 ---- find_pointer_in_block(word, dictionary_header_block, number_of_dictionary_blocks, ! &position, false); if(0 == dictionary_block_pos) { /* waislog(WLOG_HIGH, WLOG_ERROR, "Could not find pointer for word '%s' (location %ld) in block in db %s!", word, word, db->database_file); */ *************** *** 751,757 **** return(0); } answer = find_pointer_in_block(word, dictionary_block, ! DICTIONARY_BLOCK_SIZE, &position); if((NULL != number_of_occurances)) { if (answer > 0) *number_of_occurances = --- 888,894 ---- return(0); } answer = find_pointer_in_block(word, dictionary_block, ! DICTIONARY_BLOCK_SIZE, &position, findpart); if((NULL != number_of_occurances)) { if (answer > 0) *number_of_occurances = *************** *** 1450,1456 **** --- 1587,1636 ---- return(free_position); } + #ifdef BIO + /*========================* + *=== delimiters - dgg ===* + *========================*/ + char *read_delimiters(db) + database* db; + /* returns the word delimiters for a database. Beware that + * the next call to this function will overwrite the the headline_array + */ + { + static char delimiters[MAX_HEADLINE_LEN+1]; + FILE *stream = db->delimiters_stream; + delimiters[0] = '\0'; /* init to the empty string */ + + if(false == read_string_from_file(db->delimiters_stream, + delimiters, MAX_HEADLINE_LEN)){ + waislog(WLOG_HIGH, WLOG_ERROR, + "delimiters are corrupt in db %s", db->database_file); + } + /* need to weed out .dlm files that have no symbols... */ + if (delimiters[0] == '\0') return(NULL); + return(delimiters); + } + + /* writes the string to the file followed by a NULL. + * The returned number is the position in the file to start reading. + */ + long write_delimiters(delimiters,db) + char* delimiters; + database* db; + { + /* writes the headline followed by a newline. 
+ Returns the postion of the headline. + */ + long free_position; + s_fseek(db->delimiters_stream, 0L, SEEK_SET); /* _SET, only one set of delims / file ? */ + free_position = ftell(db->delimiters_stream); + fprintf(db->delimiters_stream, "%s", delimiters); + fputc(0, db->delimiters_stream); + return(free_position); + } + #endif + /* =================== */ /* === Source file === */ /* =================== */ *************** *** 1666,1671 **** --- 1844,1861 ---- s_strncat(destination,headline_table_ext,MAX_FILE_NAME_LEN,MAX_FILE_NAME_LEN); return(destination); } + + #ifdef BIO + char* delimiters_filename(destination,db) + char* destination; + database* db; + { + strncpy(destination, db->database_file,MAX_FILE_NAME_LEN); + s_strncat(destination,delimiters_ext,MAX_FILE_NAME_LEN,MAX_FILE_NAME_LEN); + return(destination); + } + #endif + char* index_filename(destination,db) char* destination; database* db; diff -bwcr wais-8-b5/ir/irfiles.h w8b5bio/ir/irfiles.h *** wais-8-b5/ir/irfiles.h Thu Apr 16 22:04:45 1992 --- w8b5bio/ir/irfiles.h Sun Nov 8 19:12:09 1992 *************** *** 35,40 **** --- 35,43 ---- #define index_ext ".inv" #define source_ext ".src" #define catalog_ext ".cat" + #ifdef BIO + #define delimiters_ext ".dlm" /* dgg */ + #endif /* these dictionary definitions are used in irhash,irverify, and irfiles */ #define DICTIONARY_HEADER_SIZE 4 *************** *** 47,56 **** --- 50,75 ---- #define DICTIONARY_TOTAL_SIZE_WORD "{}" /* the word that holds the total number of words in the whole dictionary */ #define INDEX_HEADER_SIZE 4 + #ifdef BIO + + /* !! Bug in W8B5 -- Increasing this INDEX_BLOCK_SIZE_SIZE above 2 now fails !! */ + /* it worked in W8B3 ... 
both 3 and 4 fail now */ + #define INDEX_BLOCK_SIZE_SIZE 2 /* was 2, genbank wants 3, dgg */ + + #else #define INDEX_BLOCK_SIZE_SIZE 2 + #endif + #define NEXT_INDEX_BLOCK_SIZE 4 #define INDEX_BLOCK_FLAG_SIZE 1 + + /* dgg -- this is a bug #define INDEX_BLOCK_HEADER_SIZE 7 + This == 7 only if the component SIZEs don't change + */ + #define INDEX_BLOCK_HEADER_SIZE (INDEX_BLOCK_SIZE_SIZE+NEXT_INDEX_BLOCK_SIZE+INDEX_BLOCK_FLAG_SIZE) + + #define NUMBER_OF_OCCURANCES_SIZE 4 #define INDEX_BLOCK_NOT_FULL_FLAG 101 #define INDEX_BLOCK_FULL_FLAG 69 *************** *** 63,68 **** --- 82,105 ---- #define INDEX_ELEMENT_SIZE 8 #define WORD_ID_SIZE 4 /* for posting arrays */ + #ifdef BOOLEANS /* dgg */ + #define BOOLEAN_AND "and" /* may prefer "&", but need symbol fix */ + #define BOOLEAN_NOT "not" /* may prefer "!", but need symbol fix */ + #define BOOLEAN_NOT_FLAG -91 /* stick in weight param as flag for search_word */ + #endif + + #ifdef PARTIALWORD /* dgg */ + #define PARTWORD_WILDCARD '*' + #endif + + #ifdef LITERAL /* dgg */ + #define LITERAL_KEY1 '"' + #define LITERAL_KEY2 0x27 /* single quote ' dgg */ + #define LITERAL_FLAG -92 /* stick in weight param as flag for search_word */ + #define MAX_PHRASE_LENGTH 200 + #endif + + typedef struct database { char* database_file; FILE* dictionary_stream; *************** *** 70,75 **** --- 107,115 ---- FILE* headline_table_stream; FILE* document_table_stream; FILE* index_stream; + #ifdef BIO + FILE* delimiters_stream; + #endif long doc_table_allocated_entries; hashtable* the_word_memory_hashtable; *************** *** 131,136 **** --- 171,180 ---- char *read_headline_table_entry _AP((long position,database* db)); long write_headline_table_entry _AP((char* headline, database* db)); + #ifdef BIO + char *read_delimiters _AP((database* db)); + long write_delimiters _AP((char* delimiters, database* db)); + #endif boolean read_document_table_entry _AP((document_table_entry* doc_entry,long number,database* db)); *************** *** 150,155 
**** --- 194,202 ---- long add_word_to_dictionary _AP((char *word, long index_file_block_number, long number_of_occurances, database* db)); + #ifdef PARTIALWORD + long look_up_partialword_in_dictionary _AP((char *word, long *word_id, database* db)); + #endif long look_up_word_in_dictionary _AP((char *word, long *word_id, database* db)); long init_dict_file_for_writing _AP((database *db)); void init_dict_file_detailed _AP((FILE* dictionary_stream, *************** *** 216,221 **** --- 263,271 ---- char* index_filename_with_version _AP((long version, char* destination, database* db)); char* source_filename _AP((char* destination, database* db)); + #ifdef BIO + char* delimiters_filename _AP((char* destination, database* db)); + #endif #ifdef __cplusplus } diff -bwcr wais-8-b5/ir/irhash.h w8b5bio/ir/irhash.h *** wais-8-b5/ir/irhash.h Mon Feb 24 22:40:22 1992 --- w8b5bio/ir/irhash.h Sat Oct 24 12:36:27 1992 *************** *** 32,38 **** --- 32,42 ---- * inverted file because the max length of an index block is governed * by a size that can be represented in INDEX_BLOCK_SIZE_SIZE bytes. */ + #ifdef BIO + #define MAX_OCCURANCES 100000L /* need 100000L, was 20000L, dgg */ + #else #define MAX_OCCURANCES 20000L + #endif /* this is a flag to be put in the number_of_occurances field of a word_entry so that it is always greater than the limit and no words will be diff -bwcr wais-8-b5/ir/irsearch.c w8b5bio/ir/irsearch.c *** wais-8-b5/ir/irsearch.c Sun May 10 16:44:33 1992 --- w8b5bio/ir/irsearch.c Sun Nov 8 19:17:21 1992 *************** *** 166,171 **** --- 166,173 ---- } } + + static long search_word_before_pairs _AP((char *word, long char_pos, long line_pos, long weight, long doc_id, time_t date, *************** *** 226,235 **** int * datum; count_trie_words = count_uniq = 0; /* printf("words: %s\n", words); */ ! /* ! word = strtokf(words,wordDelimiter); ! 
*/ word = (char*)strtokf_isalnum(words); while(word != NULL){ long dictionary_value; /* trim the string if necessary */ --- 228,238 ---- int * datum; count_trie_words = count_uniq = 0; /* printf("words: %s\n", words); */ ! #ifdef BIO ! word = (char*)strtokf(words,wordDelimiter); ! #else word = (char*)strtokf_isalnum(words); + #endif while(word != NULL){ long dictionary_value; /* trim the string if necessary */ *************** *** 251,257 **** --- 254,264 ---- if (*datum == 1 ) { count_uniq++; } + #ifdef BIO + word = (char *)strtokf(NULL,wordDelimiter); + #else word = (char *)strtokf_isalnum(NULL); + #endif beFriendly(); } *************** *** 311,316 **** --- 318,504 ---- return result; } + + /* dgg -- pulled this from irtfiles.c:map_over_words */ + /* returns the number of words added, or -1 if an error occurred */ + long search_over_words + _AP((char* line,long document_id,database* db)); + + long search_over_words(line, document_id, db) + char* line; + long document_id; + database* db; + { + long weight = 1L; + long file_position_before_line = 0; + boolean word_position= false; + boolean word_pairs= false; + #ifdef BIO + int minwordlen= 1; /* only if symbols are active ? 
*/ + #else + int minwordlen= 2; + #endif + + long position_in_word = 0; + long word_count = 0; + unsigned long ch; + long char_count = 0; + boolean capitalized = false; /* if the word starts with a cap */ + #ifdef LITERAL + char word[MAX_PHRASE_LENGTH + 1]; + char key; + #else + char word[MAX_WORD_LENGTH + 1]; + #endif + #ifdef BOOLEANS + #define MAX_LINE_LENGTH 1000 + boolean nextIsNot = false; + char notwords[MAX_LINE_LENGTH+1]; + #endif + + + #ifdef BOOLEANS + notwords[0]= '\0'; + #endif + + for(ch = (unsigned char)line[char_count++]; + ch != '\0'; ch = (unsigned char)line[char_count++]){ + #ifdef BIO + boolean alnum = (wordDelimiter(ch) == NOT_DELIMITER); + #else + boolean alnum = isalnum(ch); + #endif + #ifdef PARTIALWORD + if (ch == PARTWORD_WILDCARD) { alnum= true; minwordlen= MAX(2,minwordlen); } + #endif + #ifdef LITERAL + if (ch == LITERAL_KEY1) key= LITERAL_KEY1; + else if (ch == LITERAL_KEY2) key= LITERAL_KEY2; + else key= 0; + if (key != 0) { + char *cp, *match; + cp = line + char_count; + match = strchr( cp, key); + /* printf("search_over_words: literal key is [%c]\n", key); */ + if (match != NULL && cp < match) { + for (position_in_word=0; cp < match; cp++, char_count++) + if (position_in_word < MAX_PHRASE_LENGTH) { + word[position_in_word++] = char_downcase((unsigned long)*cp); + } + char_count++; /* skip closing key */ + alnum= false; + capitalized= false; + weight= LITERAL_FLAG; /* is this a safe flag parameter? -- + unused but passed on to search_word is what we need */ + /* !! need to break literal "word" into 1st dictionary word and search + on that... 
*/ + /* printf("search_over_words: literal is [%s]\n", word); */ + } + } + #endif + + if(alnum){ + /* put the character in the word if not too long */ + if(position_in_word == 0) + capitalized = isupper((unsigned long)ch)?true:false; + if(position_in_word < MAX_WORD_LENGTH){ + word[position_in_word++] = char_downcase((unsigned long)ch); + } + } + else{ /* not an in a word */ + if(position_in_word != 0){ + #ifdef BOOLEANS + /* note on BOOLEANS -- we really need to check for NOT words here, + and move them to back of line so that (wordfunction)== search_word is + called for NOT words after other words (excluding NOT inside a literal) + */ + if (nextIsNot) { + word[position_in_word] = '\0'; + strcat( notwords, word); + strcat( notwords, " "); + nextIsNot= false; + word_count++; + } + else if ((strncmp(word,BOOLEAN_NOT,position_in_word)==0)) { + nextIsNot= true; + word_count++; + } + else + #endif + /* then we have collected a word */ + if(position_in_word >= minwordlen){ /* is it reasonable ? */ + word[position_in_word] = '\0'; + if(0 != search_word_before_pairs(word, + file_position_before_line + char_count, + /*^^ this param is supposed to be start-of-word, but char_count is now at end-of-word !*/ + 0L, /* line_pos */ + weight, + document_id, + (time_t)0L, + capitalized, + db)) + return(-1); /* error */ + #ifdef BOOLEANS + nextIsNot= false; + #endif + word_count++; + } + position_in_word = 0; + } + } + } + + /* finish last word */ + #ifdef BOOLEANS + if (nextIsNot) { + word[position_in_word] = '\0'; + strcat( notwords, word); + strcat( notwords, " "); + nextIsNot= false; + word_count++; + } + else + #endif + if(position_in_word >= minwordlen){ /* is it reasonable ? 
*/ + word[position_in_word] = '\0'; + if(0 != search_word_before_pairs(word, + file_position_before_line + char_count, + 0L, /* line_pos */ + weight, + document_id, + (time_t)0L, + capitalized, + db)) + return(-1); + word_count++; + } + + #ifdef BOOLEANS + if ((notwords[0] != '\0')) { + char *wordp; + capitalized= false; + char_count= 0; /* !?? need char index for each word ? */ + weight= BOOLEAN_NOT_FLAG; /* is this a safe parameter ? */ + wordp= strtok( notwords, " "); + while (wordp!=NULL) { + if(0 != search_word_before_pairs(wordp, + file_position_before_line + char_count, + 0L, /* line_pos */ + weight, + document_id, + (time_t)0L, + capitalized, + db)) + return(-1); + wordp= strtok(NULL,NULL); + } + } + #endif + + return(word_count); + } + + boolean search_for_words(words, db, doc_id) char* words; /* break the string into words (using map_over_words) *************** *** 318,324 **** Returns true if successful. */ database *db; ! long doc_id; { #ifdef BOOL --- 506,512 ---- Returns true if successful. */ database *db; ! long doc_id; /* = 1 for words == document in relevance feedback search */ { #ifdef BOOL *************** *** 338,351 **** } #endif /* def BOOL */ /* NORMAL QUERY */ ! if( -1 == map_over_words(words, doc_id, 1L, 0L, NULL, NULL, db, ! (wordfunc*)search_word_before_pairs, 0L, 0L)) return(false); else return(true); } /* gets the next best hit from the search engine and fills in all the slots. If the document does not exist, then it gets another, etc. It returns 0 if successful */ --- 526,544 ---- } #endif /* def BOOL */ + /* dgg mods really need new version of map_over_words for searching only + (not for adding == indexing), and this way we can keep main search + routines here (irsearch.c) & search_word in sersrch.c + */ + /* NORMAL QUERY */ ! if( -1 == search_over_words(words, doc_id, db)) return(false); else return(true); } + /* gets the next best hit from the search engine and fills in all the slots. 
If the document does not exist, then it gets another, etc. It returns 0 if successful */ *************** *** 750,755 **** --- 943,962 ---- (*diags)[1] = NULL; return(false); } + + #ifdef BIO /* dgg */ + { + char *cp= read_delimiters( db); /* use data-specific delimiters, if available */ + if (cp != NULL) { + strcpy( gDelimiters, cp); + wordDelimiter= wordbreak_user; + } + else + wordDelimiter= wordbreak_notalnum; + } + #else + wordDelimiter= wordbreak_notalnum; /* actually, wordDelimeter is used only ifdef BIO ? */ + #endif /* figure out if it is a NEXT or PREVIOUS, if so, return it. */ header = handle_next_and_previous(wais_search->Docs, db, Only in w8b5bio/ir: irsearch.c.O diff -bwcr wais-8-b5/ir/irsearch.h w8b5bio/ir/irsearch.h *** wais-8-b5/ir/irsearch.h Wed Feb 5 16:37:45 1992 --- w8b5bio/ir/irsearch.h Tue Oct 13 08:32:15 1992 *************** *** 46,51 **** --- 46,54 ---- typedef struct search { database* db; double* document_score_array; + #ifdef BOOLEANS + double* prior_score_array; /* dgg, 12/91 GS TLG */ + #endif unsigned long num_best_hits; hit* best_hits; } search; Only in w8b5bio/ir: irsearch.h.O diff -bwcr wais-8-b5/ir/irtfiles.c w8b5bio/ir/irtfiles.c *** wais-8-b5/ir/irtfiles.c Wed May 6 19:31:33 1992 --- w8b5bio/ir/irtfiles.c Sun Nov 8 19:21:43 1992 *************** *** 58,63 **** --- 58,65 ---- #include "irfiles.h" #include "irtfiles.h" + #include "ircfiles.h" /* dgg, need for genbank_header_function test */ + #ifndef THINK_C #include #include *************** *** 220,225 **** --- 222,228 ---- #endif /* def NOTUSED */ + /* MAPPING A FUNCTION OVER WORDS (QUICKLY) */ *************** *** 246,251 **** --- 249,257 ---- return(0); } + + + /* returns the number of words added, or -1 if an error occurred */ long map_over_words(line, document_id, *************** *** 255,261 **** newline_terminated, db, wordfunction, ! word_position, word_pairs) char* line; long document_id; long weight; --- 261,268 ---- newline_terminated, db, wordfunction, ! 
word_position, word_pairs, ! minwordlen) /* dgg */ char* line; long document_id; long weight; *************** *** 265,270 **** --- 272,278 ---- database* db; wordfunc *wordfunction; boolean word_position, word_pairs; + int minwordlen; { /* Add words to the index if it should be done. * Returns the number of words added. *************** *** 277,290 **** long position_in_word = 0; long word_count = 0; - char word[MAX_WORD_LENGTH + 1]; unsigned long ch; long char_count = 0; boolean capitalized = false; /* if the word starts with a cap */ for(ch = (unsigned char)line[char_count++]; ch != '\0'; ch = (unsigned char)line[char_count++]){ boolean alnum = isalnum(ch); if(alnum){ /* put the character in the word if not too long */ if(position_in_word == 0) --- 285,304 ---- long position_in_word = 0; long word_count = 0; unsigned long ch; long char_count = 0; boolean capitalized = false; /* if the word starts with a cap */ + char word[MAX_WORD_LENGTH + 1]; + for(ch = (unsigned char)line[char_count++]; ch != '\0'; ch = (unsigned char)line[char_count++]){ + #ifdef BIO + boolean alnum = (wordDelimiter(ch) == NOT_DELIMITER); + #else boolean alnum = isalnum(ch); + #endif + if(alnum){ /* put the character in the word if not too long */ if(position_in_word == 0) *************** *** 296,306 **** else{ /* not an in a word */ if(position_in_word != 0){ /* then we have collected a word */ ! if(position_in_word > 1){ /* is it reasonable ? */ word[position_in_word] = '\0'; if(0 != (*wordfunction)(word, file_position_before_line + char_count, 0L, /* line_pos */ weight, document_id, --- 310,321 ---- else{ /* not an in a word */ if(position_in_word != 0){ /* then we have collected a word */ ! if(position_in_word >= minwordlen){ /* is it reasonable ? 
*/ word[position_in_word] = '\0'; if(0 != (*wordfunction)(word, file_position_before_line + char_count, + /*^^ dgg, this param is supposed to be start-of-word, but char_count is now at end-of-word !*/ 0L, /* line_pos */ weight, document_id, *************** *** 317,323 **** } } /* finish last word */ ! if(position_in_word > 1){ /* is it reasonable ? */ word[position_in_word] = '\0'; if(0 != (*wordfunction)(word, file_position_before_line + char_count, --- 332,338 ---- } } /* finish last word */ ! if(position_in_word >= minwordlen){ /* is it reasonable ? */ word[position_in_word] = '\0'; if(0 != (*wordfunction)(word, file_position_before_line + char_count, *************** *** 353,359 **** static long add_words_if_appropriate _AP((char* line,long document_id,long weight,long file_position_before_line, long* line_length,boolean* newline_terminated,database* db, ! boolean word_position, boolean word_pairs)); static long add_words_if_appropriate(line, --- 368,375 ---- static long add_words_if_appropriate _AP((char* line,long document_id,long weight,long file_position_before_line, long* line_length,boolean* newline_terminated,database* db, ! boolean word_position, boolean word_pairs, ! int minwordlen)); static long add_words_if_appropriate(line, *************** *** 363,369 **** line_length, newline_terminated, db, ! word_position, word_pairs) char* line; long document_id; long weight; --- 379,386 ---- line_length, newline_terminated, db, ! word_position, word_pairs, ! minwordlen) /* dgg */ char* line; long document_id; long weight; *************** *** 372,377 **** --- 389,395 ---- boolean *newline_terminated; database* db; boolean word_position, word_pairs; + int minwordlen; { /* Add words to the index if it should be done. * Returns the number of words added. 
*************** *** 391,397 **** --- 409,419 ---- for(ch = (unsigned char)line[char_count++]; ch != '\0'; ch = (unsigned char)line[char_count++]){ + #ifdef BIO + boolean alnum = (wordDelimiter(ch) == NOT_DELIMITER); + #else boolean alnum = isalnum(ch); + #endif if(alnum){ /* put the character in the word if not too long */ if(position_in_word == 0) *************** *** 403,409 **** else{ /* not an in a word */ if(position_in_word != 0){ /* then we have collected a word */ ! if(position_in_word > 1){ /* is it reasonable ? */ word[position_in_word] = '\0'; add_word_before_pairs(word, file_position_before_line + char_count, --- 425,431 ---- else{ /* not an in a word */ if(position_in_word != 0){ /* then we have collected a word */ ! if(position_in_word >= minwordlen){ /* is it reasonable ? */ word[position_in_word] = '\0'; add_word_before_pairs(word, file_position_before_line + char_count, *************** *** 421,427 **** } } /* finish last word */ ! if(position_in_word > 1){ /* is it reasonable ? */ word[position_in_word] = '\0'; add_word(word, file_position_before_line + char_count, --- 443,449 ---- } } /* finish last word */ ! if(position_in_word >= minwordlen){ /* is it reasonable ? */ word[position_in_word] = '\0'; add_word(word, file_position_before_line + char_count, *************** *** 468,483 **** #define iterations_to_reorder 50 /* 1 is best but slow */ static void finish_document ! _AP((char* header,char* line,long document_id, document_table_entry* the_document_table_entry, long file_position_before_line, long file_position_before_document,database* db, ! boolean word_position, boolean word_pairs)); static void ! finish_document(header,line,document_id,the_document_table_entry, file_position_before_line, file_position_before_document, ! db, word_position, word_pairs) char* header; char* line; long document_id; --- 490,508 ---- #define iterations_to_reorder 50 /* 1 is best but slow */ static void finish_document ! 
_AP((boolean recountHeader, char* header,char* line,long document_id, document_table_entry* the_document_table_entry, long file_position_before_line, long file_position_before_document,database* db, ! boolean word_position, boolean word_pairs, ! int minwordlen)); static void ! finish_document(recountHeader, header,line,document_id,the_document_table_entry, file_position_before_line, file_position_before_document, ! db, word_position, word_pairs, ! minwordlen) ! boolean recountHeader; char* header; char* line; long document_id; *************** *** 488,494 **** boolean word_position, word_pairs; { long line_length; boolean newline_terminated; ! if(0 != strlen(header)){ /* add weights for the header (if there was one) */ long number_of_words = map_over_words(header, document_id, --- 513,519 ---- boolean word_position, word_pairs; { long line_length; boolean newline_terminated; ! if(0 != strlen(header) && recountHeader){ /* add weights for the header (if there was one) */ long number_of_words = map_over_words(header, document_id, *************** *** 499,505 **** &newline_terminated, db, add_word_before_pairs, ! word_position, word_pairs); if(number_of_words == -1) waislog(WLOG_HIGH, WLOG_ERROR, "map_over_words failed"); db->total_word_count += number_of_words; --- 524,531 ---- &newline_terminated, db, add_word_before_pairs, ! word_position, word_pairs, ! minwordlen); if(number_of_words == -1) waislog(WLOG_HIGH, WLOG_ERROR, "map_over_words failed"); db->total_word_count += number_of_words; *************** *** 615,621 **** #define LENGTH_OF_NEWLINE 1 /* this will be 2 on a PC, I think */ ! void index_text_file(filename, separator_function, header_function, date_function, --- 641,647 ---- #define LENGTH_OF_NEWLINE 1 /* this will be 2 on a PC, I think */ ! 
/* void index_text_file(filename, separator_function, header_function, date_function, *************** *** 624,636 **** --- 650,670 ---- db, check_for_text_file, check_for_file_already_indexed, + word_position, word_pairs, minwordlen) */ + + void index_text_file(filename, dataops, db, + check_for_text_file, + check_for_file_already_indexed, word_position, word_pairs) char* filename; + dataopsrec* dataops; + /* boolfunc *separator_function; voidfunc *header_function; longfunc *date_function; voidfunc *finish_header_function; char *type; + */ database* db; boolean check_for_text_file; boolean check_for_file_already_indexed; *************** *** 667,672 **** --- 701,707 ---- long file_position_before_document = 0; long date; + if(NULL == input_stream){ waislog(WLOG_HIGH, WLOG_ERROR, "File %s does not exist", filename); *************** *** 677,683 **** time_t time; char full_path[MAX_FILENAME_LEN]; truename(filename, full_path); ! if(true == filename_in_database(full_path, type, &time, db)){ /* check that it is the same time as this file */ if(time == file_write_date(filename)){ waislog(WLOG_HIGH, WLOG_INDEX, --- 712,718 ---- time_t time; char full_path[MAX_FILENAME_LEN]; truename(filename, full_path); ! if(true == filename_in_database(full_path, dataops->type, &time, db)){ /* check that it is the same time as this file */ if(time == file_write_date(filename)){ waislog(WLOG_HIGH, WLOG_INDEX, *************** *** 712,718 **** } /* write out the filename */ ! filename_id = write_filename_table_entry(filename, type, db); /* (if (not *drop_table*) (make_drop_table)) maybe put in later */ --- 747,753 ---- } /* write out the filename */ ! filename_id = write_filename_table_entry(filename, dataops->type, db); /* (if (not *drop_table*) (make_drop_table)) maybe put in later */ *************** *** 743,756 **** header[0] = '\0'; /* set it to the empty string */ if(eof || ! ((NULL != separator_function) && ! 
separator_function(line))){ /* we are processing a separator, therefore we should * finish off the last document, and start a new one */ ! if(NULL != finish_header_function){ ! finish_header_function(header); } if(0 == strlen(header)){ char full_path[1000]; --- 778,790 ---- header[0] = '\0'; /* set it to the empty string */ if(eof || ! ((NULL != dataops->separator_function) && dataops->separator_function(line))){ /* we are processing a separator, therefore we should * finish off the last document, and start a new one */ ! if(NULL != dataops->finish_header_function){ ! dataops->finish_header_function(header); } if(0 == strlen(header)){ char full_path[1000]; *************** *** 761,772 **** } the_document_table_entry.number_of_lines--; /* dont count separator */ /* finish off the last */ ! finish_document(header, line, document_id, &the_document_table_entry, eof? /* if EOF, use file length */ file_length(input_stream):file_position_before_line, file_position_before_document, ! db, word_position, word_pairs); /* initialize the next one */ the_document_table_entry.filename_id = filename_id; the_document_table_entry.start_character = file_position_before_line; --- 795,808 ---- } the_document_table_entry.number_of_lines--; /* dont count separator */ /* finish off the last */ ! finish_document( dataops->extraheaderweight, ! header, line, document_id, &the_document_table_entry, eof? /* if EOF, use file length */ file_length(input_stream):file_position_before_line, file_position_before_document, ! db, word_position, word_pairs, ! dataops->minwordlen); /* initialize the next one */ the_document_table_entry.filename_id = filename_id; the_document_table_entry.start_character = file_position_before_line; *************** *** 778,792 **** if(!eof) { /* not EOF */ ! if(NULL != header_function){ ! header_function(line); } ! if (date_function != NULL && ! 
(date = date_function(line)) > 0) the_document_table_entry.date = date; line_length = strlen(line); newline_terminated = true; } else{ /* EOF */ /* printf("closing the file\n"); */ s_fclose(input_stream); --- 814,847 ---- if(!eof) { /* not EOF */ ! if(NULL != dataops->header_function){ ! dataops->header_function(line); } ! if (dataops->date_function != NULL && ! (date = dataops->date_function(line)) > 0) the_document_table_entry.date = date; + /* dgg -- don't know where this goes. */ + + if (dataops->addseparatorwords) { /* dgg */ + long number_of_words; + number_of_words = map_over_words(line, document_id, dataops->repeat_weight, + file_position_before_line - + file_position_before_document, + &line_length, + &newline_terminated, + db, + add_word_before_pairs, + word_position, word_pairs, + dataops->minwordlen); + the_document_table_entry.document_length += number_of_words; + len_of_files_since_last_delete += number_of_words; + len_of_files_since_last_flush += number_of_words; + } + else { line_length = strlen(line); newline_terminated = true; } + } else{ /* EOF */ /* printf("closing the file\n"); */ s_fclose(input_stream); *************** *** 797,812 **** else{ /* not a separator or EOF so process the line */ long number_of_words; ! if(NULL != header_function) header_function(line); ! if (date_function != NULL && the_document_table_entry.date == 0 && ! (date = date_function(line)) > 0) the_document_table_entry.date = date; if(index_contents ) { if( _indexable_section) { ! number_of_words = map_over_words(line, document_id, 1L, file_position_before_line - file_position_before_document, &line_length, --- 852,867 ---- else{ /* not a separator or EOF so process the line */ long number_of_words; ! if (dataops->date_function != NULL && the_document_table_entry.date == 0 && ! 
(date = dataops->date_function(line)) > 0) the_document_table_entry.date = date; + if(NULL != dataops->header_function) dataops->header_function(line); if(index_contents ) { if( _indexable_section) { ! number_of_words = map_over_words(line, document_id, dataops->repeat_weight, file_position_before_line - file_position_before_document, &line_length, *************** *** 813,819 **** &newline_terminated, db, add_word_before_pairs, ! word_position, word_pairs); if(number_of_words == -1) waislog(WLOG_HIGH, WLOG_ERROR, "map_over_words failed"); the_document_table_entry.document_length += number_of_words; --- 868,875 ---- &newline_terminated, db, add_word_before_pairs, ! word_position, word_pairs, ! dataops->minwordlen); if(number_of_words == -1) waislog(WLOG_HIGH, WLOG_ERROR, "map_over_words failed"); the_document_table_entry.document_length += number_of_words; *************** *** 884,905 **** /* recursively indexes the directory specified. * If it is a file, then index it. */ ! void index_directory(file, ! separator_function, ! header_function, ! date_function, ! finish_header_function, ! type, ! db, check_for_text_file, check_for_file_already_indexed, word_position, word_pairs) char *file; ! boolfunc *separator_function; ! voidfunc *header_function; ! longfunc *date_function; ! voidfunc *finish_header_function; ! char *type; database* db; boolean check_for_text_file; boolean check_for_file_already_indexed; --- 940,951 ---- /* recursively indexes the directory specified. * If it is a file, then index it. */ ! void index_directory(file, dataops, db, check_for_text_file, check_for_file_already_indexed, word_position, word_pairs) char *file; ! dataopsrec* dataops; database* db; boolean check_for_text_file; boolean check_for_file_already_indexed; *************** *** 912,923 **** if(filep(file)){ waislog(WLOG_MEDIUM, WLOG_INDEX, "Indexing file: %s", file); ! index_text_file(file, separator_function, ! header_function, ! date_function, ! finish_header_function, ! type, ! 
db, check_for_text_file, check_for_file_already_indexed, word_position, word_pairs); --- 958,964 ---- if(filep(file)){ waislog(WLOG_MEDIUM, WLOG_INDEX, "Indexing file: %s", file); ! index_text_file(file, dataops, db, check_for_text_file, check_for_file_already_indexed, word_position, word_pairs); *************** *** 937,948 **** strcpy(name, file); /* copy the filename into the name variable */ strcat(name, "/"); strcat(name, list[j]->d_name); ! index_directory(name, separator_function, ! header_function, ! date_function, ! finish_header_function, ! type, ! db, check_for_text_file, check_for_file_already_indexed, word_position, word_pairs); --- 978,984 ---- strcpy(name, file); /* copy the filename into the name variable */ strcat(name, "/"); strcat(name, list[j]->d_name); ! index_directory(name, dataops, db, check_for_text_file, check_for_file_already_indexed, word_position, word_pairs); diff -bwcr wais-8-b5/ir/irtfiles.h w8b5bio/ir/irtfiles.h *** wais-8-b5/ir/irtfiles.h Wed May 6 19:32:15 1992 --- w8b5bio/ir/irtfiles.h Sun Nov 8 19:22:11 1992 *************** *** 19,24 **** --- 19,27 ---- { #endif /* def __cplusplus */ + #define kWordBreakNonAlnum 0 + #define kWordBreakSpace 1 + typedef boolean (boolfunc)(); typedef void (voidfunc)(); typedef long (longfunc)(); *************** *** 25,30 **** --- 28,50 ---- char *make_joint_word _AP((char *word1, char *word2)); + typedef struct { /* dgg, use this instead of a dozen parameters in several funcs */ + boolean (*separator_function)(); + void (*header_function)(); + long (*date_function)(); + void (*finish_header_function)(); + boolean (*wordDelimiter)(); /* ? use global wordDelimiter ? */ + char *type; /* TEXT, PICT, ... 
*/ + boolean addseparatorwords; /* add words in line separating documents */ + boolean extraheaderweight; /* add header weight */ + int repeat_weight; /* 0 for max of 1 word hit per doc */ + int minwordlen; /* 2= default, 1=symbols */ + char *delimiters; + } dataopsrec, *dataopsptr; + + boolean (*wordDelimiter)(); /* global word break func, dgg */ + + typedef long (wordfunc) _AP((char *word, long char_pos, long line_pos, long weight, long doc_id, time_t date, *************** *** 34,42 **** long map_over_words _AP((char* line,long document_id,long weight,long file_position_before_line, long* line_length,boolean* newline_terminated,database* db, ! wordfunc wordfunction, boolean word_position, boolean word_pairs)); void index_text_file _AP((char* filename, boolean (*separator_function)(), void (*header_function)(), longfunc *date_function, --- 54,70 ---- long map_over_words _AP((char* line,long document_id,long weight,long file_position_before_line, long* line_length,boolean* newline_terminated,database* db, ! wordfunc wordfunction, boolean word_position, boolean word_pairs, ! int minwordlen)); void index_text_file _AP((char* filename, + dataopsrec* dataops, + database* db, + boolean check_for_text_file, + boolean check_for_file_already_indexed, + boolean word_position, boolean word_pairs)); + + /* void index_text_file _AP((char* filename, boolean (*separator_function)(), void (*header_function)(), longfunc *date_function, *************** *** 45,51 **** database* db, boolean check_for_text_file, boolean check_for_file_already_indexed, ! boolean word_position, boolean word_pairs)); boolean directoryp _AP ((char *file)); --- 73,80 ---- database* db, boolean check_for_text_file, boolean check_for_file_already_indexed, ! boolean word_position, boolean word_pairs, ! 
int minwordlen)); */ boolean directoryp _AP ((char *file)); *************** *** 52,57 **** --- 81,93 ---- boolean filep _AP((char* file)); void index_directory _AP ((char *file, + dataopsrec* dataops, + database *db, + boolean check_for_text_file, + boolean check_for_file_already_indexed, + boolean word_position, boolean word_pairs)); + + /* void index_directory _AP ((char *file, boolfunc *separator_function, voidfunc *header_function, longfunc *date_function, *************** *** 60,66 **** database *db, boolean check_for_text_file, boolean check_for_file_already_indexed, ! boolean word_position, boolean word_pairs)); #ifdef __cplusplus } --- 96,103 ---- database *db, boolean check_for_text_file, boolean check_for_file_already_indexed, ! boolean word_position, boolean word_pairs, ! int minwordlen)); */ #ifdef __cplusplus } Only in w8b5bio/ir: iubuild.c diff -bwcr wais-8-b5/ir/macbuild.c w8b5bio/ir/macbuild.c *** wais-8-b5/ir/macbuild.c Wed Feb 12 16:34:10 1992 --- w8b5bio/ir/macbuild.c Sun Nov 8 19:23:40 1992 *************** *** 187,195 **** --- 187,212 ---- long count; SFReply macSFReply; Point pos; + dataopsrec dataops; InitProfile(3000,200); + /* dgg -- must duplicate mods to irbuild.c */ + dataops.separator_function= NULL; + dataops.header_function= NULL; + dataops.date_function= NULL; + dataops.finish_header_function= NULL; + dataops.type= "TEXT"; + dataops.wordDelimiter= wordbreak_notalnum; + dataops.addseparatorwords= false; + dataops.extraheaderweight= true; + dataops.repeat_weight= 1; + dataops.minwordlen= 2; + stop_list_file("\0"); + gDelimiters[0]= '\0'; + wordDelimiter= wordbreak_notalnum; + /* dgg -- end new inits */ + _profile = false; _trace = false; freopen("stdout","w",stdout); *************** *** 261,270 **** if(!(PBPtr->hFileInfo.ioFlAttrib & 0x10)) { printf("Indexing File: %s...\n", filename); ! index_text_file(filename, ! NULL, NULL, NULL, ! "TEXT", /* this should be the mac filetype XXX */ ! 
db,true, false); } else printf("Directory %s is not indexed\n", filename); --- 278,284 ---- if(!(PBPtr->hFileInfo.ioFlAttrib & 0x10)) { printf("Indexing File: %s...\n", filename); ! index_text_file(filename, &dataops, db,true, false); } else printf("Directory %s is not indexed\n", filename); diff -bwcr wais-8-b5/ir/sersrch.c w8b5bio/ir/sersrch.c *** wais-8-b5/ir/sersrch.c Tue Apr 28 19:00:13 1992 --- w8b5bio/ir/sersrch.c Thu Oct 29 19:59:32 1992 *************** *** 36,41 **** --- 36,44 ---- #include "cdialect.h" #include "irfiles.h" + #ifdef BIO + #include "irtfiles.h" /* dgg, for wordDelimiter */ + #endif #include "irsearch.h" #include "irext.h" #include "byte_order.h" *************** *** 170,175 **** --- 173,181 ---- double *document_score_array = NULL; long document_score_array_len = 0; + #ifdef BOOLEANS + double *prev_score_array = NULL; /* 12/91 GS TLG */ + #endif /* make_document_score_array insures that the document_score_array array is long enough, if not it makes it long enough */ *************** *** 182,190 **** --- 188,202 ---- /* we have to make a new one. 
free the old one first (if any) */ if(document_score_array != 0){ s_free(document_score_array); + #ifdef BOOLEANS + s_free(prev_score_array); /* 12/91 GS TLG */ + #endif } document_score_array = (double*)s_malloc( (size_t)(length * sizeof(double))); + #ifdef BOOLEANS + prev_score_array = (double*)s_malloc((size_t)(length * sizeof(double))); /* 12/91 GS TLG */ + #endif document_score_array_len = length; } *************** *** 192,197 **** --- 204,212 ---- static void destroy_document_score_array() { s_free(document_score_array); + #ifdef BOOLEANS + s_free(prev_score_array); /* 12/91 GS TLG */ + #endif document_score_array_len = 0; } *************** *** 200,205 **** --- 215,224 ---- { memset(document_score_array, 0, document_score_array_len * sizeof(double)); + #ifdef BOOLEANS + memset(prev_score_array, 0, /* 12/91 GS TLG */ + document_score_array_len * sizeof(double)); /* 12/91 GS TLG */ + #endif } /* for debugging purposes */ *************** *** 420,425 **** --- 439,450 ---- return(0); } + + #ifdef BOOLEANS + static boolean gLastAnd= false; + static boolean gLastNot= false; + #endif + /* see irext.h for doc */ long search_word(word,char_pos, line_pos, weight, doc_id, word_pair, db) *************** *** 452,461 **** --- 477,557 ---- char *i; FILE *stream = NULL; + #ifdef LITERAL + long txt_pos, icnt, wcnt, pcnt; /* 2/92 GS TLG */ + document_table_entry doc_entry; /* 2/92 GS TLG */ + static FILE *txt_stream = NULL; /* 2/92 GS TLG */ + char cmpr_word[MAX_PHRASE_LENGTH + 1]; /* 2/92 GS TLG */ + char phrase[MAX_PHRASE_LENGTH + 1]; /* 2/92 GS TLG */ + char txt_filename[MAX_FILENAME_LEN + 1]; /* 2/92 GS TLG */ + static char prev_txt_filename[MAX_FILENAME_LEN + 1]; /* 2/92 GS TLG; static because txt_stream is static: the name of the file it has open must survive between calls */ + char txt_type[MAX_TYPE_LEN + 1]; /* 2/92 GS TLG */ + long phraselen= 0, txt_pos_fix= 0; + #endif + + #ifdef LITERAL + if (weight==LITERAL_FLAG) { + /* goto after_booleans */ + /* printf("search_word: literal word is [%s]\n", word); */ + } + else + #endif + #ifdef BOOLEANS + if
(strcmp(word,BOOLEAN_AND)==0) { /* should be all lowercase cmp here */ + gLastAnd= true; + return(0); + } + else if (strcmp(word,BOOLEAN_NOT)==0) { + /* ^^ this is bad if we intersperse "not"s in a query -- + docs found after not word may include notted word -- + need to go back to doing not words after others -- + but need now to check for literal string first + */ + gLastNot= true; + return(0); + } + if (weight == BOOLEAN_NOT_FLAG) gLastNot= true; + #else + ; /* if not LITERAL_FLAG */ + #endif + index_buffer = (char*)index_buffer_data; + #ifdef LITERAL + if (weight==LITERAL_FLAG) { + /* note: we found the first word of phrase once in map_over_words, but i'm too lazy + to put another parameter in that cascade of function calls it takes + to get here. + */ + char word1[MAX_WORD_LENGTH + 1]; + register int i, len; + register boolean more; + phraselen= MIN( MAX_PHRASE_LENGTH, strlen(word)); + len = MIN( MAX_WORD_LENGTH, phraselen); + for (i=0, more=true; i < len && more; ) { + word1[i] = word[i]; i++; /* copy, then advance: with word[i++] in the same expression the index used for word1[] was undefined */ + #ifdef BIO + more= (wordDelimiter(word[i]) == NOT_DELIMITER); + #else + more= (isalnum(word[i])); + #endif + } + word1[i]= '\0'; + txt_pos_fix= strlen(word1) + 1; + /* printf("search_word: literal word1 is [%s]\n", word1); */ index_file_block_number = + look_up_word_in_dictionary(word1, &number_of_occurances, db); + } + else + #endif /* LITERAL */ + + #ifdef PARTIALWORD + index_file_block_number = + look_up_partialword_in_dictionary(word, &number_of_occurances, db); + #else + index_file_block_number = look_up_word_in_dictionary(word, &number_of_occurances, db); + #endif current_best_hit = 0; /* so that the best hits willstart from 0 */ *************** *** 464,469 **** --- 560,568 ---- make_document_score_array(db->doc_table_allocated_entries); if(index_file_block_number >= 0){ + #ifdef PARTIALWORD + while(index_file_block_number > 0){ /* dgg, need 2nd loop here for multiple partwords */ + #endif stream = db->index_stream; while((not_full_flag !=
INDEX_BLOCK_NOT_FULL_FLAG) && *************** *** 475,480 **** --- 574,582 ---- waislog(WLOG_HIGH, WLOG_ERROR, "fseek failed into the inverted file to position %ld", (long)index_file_block_number); + #ifdef BOOLEANS + gLastNot= gLastAnd= false; + #endif return(-1); } *************** *** 511,516 **** --- 613,621 ---- { waislog(WLOG_HIGH, WLOG_ERROR, "reading from the index file failed"); + #ifdef BOOLEANS + gLastNot= gLastAnd= false; + #endif return(-1); } *************** *** 526,531 **** --- 631,639 ---- waislog(WLOG_HIGH, WLOG_ERROR, "Expected the flag in the inverted file to be valid. it is %ld", not_full_flag); + #ifdef BOOLEANS + gLastNot= gLastAnd= false; + #endif return(-1); } /* printf("number of valid bytes: %ld\n", number_of_valid_entries); */ *************** *** 547,552 **** --- 655,670 ---- DOCUMENT_ID_SIZE+WORD_POSITION_SIZE+CHARACTER_POSITION_SIZE); ASSIGN(did,DOCUMENT_ID_SIZE,i,INDEX_ELEMENT_SIZE,0); + #ifdef LITERAL + /* dgg -- is this proper update of read form to ASSIGN form ??*/ + /* txt_pos = read_bytes(CHARACTER_POSITION_SIZE, stream);*/ /* 2/92 GS TLG */ + if ((weight == LITERAL_FLAG) && (0 == doc_id)) { + ASSIGN(txt_pos,CHARACTER_POSITION_SIZE,i+DOCUMENT_ID_SIZE+WORD_POSITION_SIZE, + INDEX_ELEMENT_SIZE,DOCUMENT_ID_SIZE+WORD_POSITION_SIZE); + /* printf("search_word: txtpos=%d, wgt=%d, did=%d\n", txt_pos, wgt, did); */ + } + #endif + internal_weight = wgt; internal_document_id = did; *************** *** 559,581 **** { waislog(WLOG_HIGH, WLOG_ERROR, "reading from the doc-id table failed"); return(-1); } /* if(doc_id > 0) we are doing a relevant document */ document_score_array[internal_document_id] += (doc_id) ? (internal_weight * RF_WEIGHTING) : internal_weight; i+=INDEX_ELEMENT_SIZE; } } return(0); } else if(0 == index_file_block_number){ /* an error occurred on looking up the word */ return(-1); } ! 
else /* index_file_block_number is negative */ return(0); /* word not present */ } /* now collect the best hits */ long finished_search_word(db) --- 677,783 ---- { waislog(WLOG_HIGH, WLOG_ERROR, "reading from the doc-id table failed"); + #ifdef BOOLEANS + gLastNot= gLastAnd= false; + #endif return(-1); } + + #ifdef LITERAL + if ((weight == LITERAL_FLAG) && (0 == doc_id)) { /* 2/92 GS TLG */ + if (true == read_document_table_entry(&doc_entry, /* 2/92 GS TLG */ + internal_document_id, db)) /* 2/92 GS TLG */ + { /* 2/92 GS TLG */ + read_filename_table_entry(doc_entry.filename_id, /* 2/92 GS TLG */ + txt_filename, txt_type, NULL, db); /* 2/92 GS TLG */ + /* printf("search_word: document is [%s]\n", txt_filename); */ + if (NULL == txt_stream) { + txt_stream = s_fopen(txt_filename, "rb"); + strcpy(prev_txt_filename, txt_filename); + } + else if (0 != strcmp(txt_filename, prev_txt_filename)) { /* compare file names (not the FILE pointer): reopen only when this hit is in a different file */ + s_fclose(txt_stream); + txt_stream = s_fopen(txt_filename, "rb"); + strcpy(prev_txt_filename, txt_filename); /* 2/92 GS TLG */ + } + + txt_pos += doc_entry.start_character - txt_pos_fix; /* dgg */ + s_fseek(txt_stream, txt_pos, SEEK_SET); /* 2/92 GS TLG */ + fread(phrase, 1L, phraselen, txt_stream); /* 2/92 GS TLG */ + /* { phrase[phraselen]= '\0'; + printf("search_word: file phrase is [%s]\n", phrase); + } */ + if (0 != strncasecmp(word, phrase, phraselen)) /* 2/92 GS TLG */ + internal_weight = 0; /* 2/92 GS TLG */ + } + } /* 2/92 GS TLG */ + #endif + + #ifdef BOOLEANS + if (gLastNot) { + document_score_array[internal_document_id] = 0; + /* printf("search_word: boolean 'not' scored\n"); */ + } + else + #endif + { /* if(doc_id > 0) we are doing a relevant document */ document_score_array[internal_document_id] += (doc_id) ?
(internal_weight * RF_WEIGHTING) : internal_weight; + } + i+=INDEX_ELEMENT_SIZE; } } + + #ifdef PARTIALWORD + index_file_block_number = + look_up_partialword_in_dictionary(NULL, &number_of_occurances, db); + } + #endif + + + #ifdef BOOLEANS + for (count=0; count < db->doc_table_allocated_entries; count++) { /* 12/91 GS TLG */ + if (!gLastAnd) { /* 12/91 GS TLG */ + prev_score_array[count] = document_score_array[count]; /* 12/91 GS TLG */ + } /* 12/91 GS TLG */ + else { /* 12/91 GS TLG */ + if ((document_score_array[count] == prev_score_array[count]) /* 12/91 GS TLG */ + || (prev_score_array[count] == 0)) { + document_score_array[count] = 0; /* 12/91 GS TLG */ + prev_score_array[count] = 0; /* 12/91 GS TLG */ + } /* 12/91 GS TLG */ + else { + prev_score_array[count] = document_score_array[count]; /* 12/91 GS TLG */ + } /* 12/91 GS TLG */ + } /* 12/91 GS TLG */ + } /* 12/91 GS TLG */ + /* if (gLastAnd) printf("search_word: boolean `and' scored\n"); */ + #endif + + #ifdef BOOLEANS + gLastNot= gLastAnd= false; + #endif /* BOOLEANS */ return(0); } + else if(0 == index_file_block_number){ /* an error occurred on looking up the word */ + #ifdef BOOLEANS + gLastNot= gLastAnd= false; + #endif return(-1); } ! ! else { /* index_file_block_number is negative */ ! #ifdef BOOLEANS ! gLastNot= gLastAnd= false; ! 
#endif return(0); /* word not present */ } + } + /* now collect the best hits */ long finished_search_word(db) Only in w8b5bio/ir: sersrch.c.O diff -bwcr wais-8-b5/ir/server-single.c w8b5bio/ir/server-single.c *** wais-8-b5/ir/server-single.c Sun May 10 16:49:04 1992 --- w8b5bio/ir/server-single.c Sun Nov 8 19:27:49 1992 *************** *** 399,408 **** --- 399,425 ---- extern char *sys_errlist[]; char host_name[255]; static long current_id = 1, current_log_line = 0; + dataopsrec dataops; command_name = next_argument; host_name[0] = 0; + /* dgg -- must duplicate mods to irbuild.c, here is mini-build of INFO.src */ + dataops.separator_function= NULL; + dataops.header_function= NULL; + dataops.date_function= NULL; + dataops.finish_header_function= NULL; + dataops.type= "WSRC"; + dataops.wordDelimiter= wordbreak_notalnum; + dataops.addseparatorwords= false; + dataops.extraheaderweight= true; + dataops.repeat_weight= 1; + dataops.minwordlen= 2; + stop_list_file("\0"); + gDelimiters[0]= '\0'; + wordDelimiter= wordbreak_notalnum; + /* dgg -- end new inits */ + getitimer(ITIMER_REAL, &old); for(i = 0; i < 200; i++) clients[i].file = NULL; *************** *** 555,562 **** strncat(filename, (*s)->d_name, MAX_FILENAME_LEN); waislog(WLOG_MEDIUM, WLOG_INDEX, "Indexing %s", filename); ! index_text_file(filename, NULL, NULL, NULL, ! NULL, "WSRC", db, true, false); s++; } freedir(list); /* array of filenames */ --- 572,578 ---- strncat(filename, (*s)->d_name, MAX_FILENAME_LEN); waislog(WLOG_MEDIUM, WLOG_INDEX, "Indexing %s", filename); ! 
index_text_file(filename, &dataops, db, true, false); s++; } freedir(list); /* array of filenames */ diff -bwcr wais-8-b5/ir/server.c w8b5bio/ir/server.c *** wais-8-b5/ir/server.c Sun May 10 16:47:38 1992 --- w8b5bio/ir/server.c Mon Nov 9 10:10:52 1992 *************** *** 409,414 **** --- 409,415 ---- extern char *sys_errlist[]; char host_name[255], host_address[255]; extern void filename_finish_header_function(); + dataopsrec dataops; #ifdef SET_LIMIT struct rlimit rlp; *************** *** 418,423 **** --- 419,440 ---- setrlimit(RLIMIT_CORE, &rlp); #endif + /* dgg -- must duplicate mods to irbuild.c, here is mini-build of INFO.src */ + dataops.separator_function= NULL; + dataops.header_function= NULL; + dataops.date_function= NULL; + dataops.finish_header_function= filename_finish_header_function; + dataops.type= "WSRC"; + dataops.wordDelimiter= wordbreak_notalnum; + dataops.addseparatorwords= false; + dataops.extraheaderweight= true; + dataops.repeat_weight= 1; + dataops.minwordlen= 2; + stop_list_file("\0"); + gDelimiters[0]= '\0'; + wordDelimiter= wordbreak_notalnum; + /* dgg -- end new inits */ + tcp_port = 210; /* tcp_port to use */ command_name = next_argument; host_name[0] = 0; *************** *** 439,445 **** peer = gethostbyaddr(&source.sin_addr, 4, AF_INET); ! if(peer != NULL) sprintf(host_name, "%s", peer->h_name); sprintf(host_address, "%s", --- 456,462 ---- peer = gethostbyaddr(&source.sin_addr, 4, AF_INET); ! if(peer != NULL) { sprintf(host_name, "%s", peer->h_name); sprintf(host_address, "%s", *************** *** 450,455 **** --- 467,473 ---- #endif /* sparc */ ); } + } else sprintf(host_address, "Error getting socket: %d, %s.", errno, sys_errlist[errno]); use_stdio = TRUE; *************** *** 617,625 **** strncat(filename, (*s)->d_name, MAX_FILENAME_LEN); waislog(WLOG_MEDIUM, WLOG_INDEX, "Indexing %s", filename); ! index_text_file(filename, NULL, NULL, NULL, ! filename_finish_header_function, ! 
"WSRC", db, true, false, false, true); s++; } --- 635,641 ---- strncat(filename, (*s)->d_name, MAX_FILENAME_LEN); waislog(WLOG_MEDIUM, WLOG_INDEX, "Indexing %s", filename); ! index_text_file(filename, &dataops, db, true, false, false, true); s++; } Only in w8b5bio/ir: server.c.orig diff -bwcr wais-8-b5/ir/sockets.c w8b5bio/ir/sockets.c *** wais-8-b5/ir/sockets.c Wed May 6 19:34:00 1992 --- w8b5bio/ir/sockets.c Mon Nov 9 10:10:52 1992 *************** *** 187,192 **** --- 187,193 ---- peer = gethostbyaddr(&source.sin_addr, 4, AF_INET); + if(peer != NULL) { waislog(WLOG_MEDIUM, WLOG_CONNECT, "Accepted connection from: %s [%s]", peer->h_name, *************** *** 196,201 **** --- 197,213 ---- inet_ntoa(source.sin_addr) #endif /* sparc */ ); + } + else { + waislog(WLOG_MEDIUM, WLOG_CONNECT, + "Accepted connection from: [%s]", + #if defined(sparc) && defined(__GNUC__) + inet_ntoa(&source.sin_addr) + #else + inet_ntoa(source.sin_addr) + #endif /* sparc */ + ); + } } if (*fd < 0) panic("can't accept connection"); Only in w8b5bio/ir: sockets.c.orig diff -bwcr wais-8-b5/ir/stoplist.c w8b5bio/ir/stoplist.c *** wais-8-b5/ir/stoplist.c Wed Feb 12 16:49:08 1992 --- w8b5bio/ir/stoplist.c Thu Oct 29 20:01:49 1992 *************** *** 23,29 **** --- 23,31 ---- #include "stoplist.h" #include "cutil.h" + #include + static char stoplist_filename[MAX_FILENAME_LEN]; /* dgg */ long stoplist_pointer = 0; char *stoplist[] = { "a", *************** *** 416,423 **** --- 418,468 ---- "yourselves", "z", NULL}; + long nstops = 0; + char **stop_from_file = NULL; /* dgg */ + char **stop_ptr; /* dgg */ + typedef char (*charptr); + + void stop_list_file(filename) + char *filename; + { + strcpy(stoplist_filename, filename); + } + void init_stop_list() { + #ifdef BIO + if (nstops>0 && stop_from_file != NULL) { /* dgg */ + int i; + for (i=0; i