#!/usr/bin/perl ############################################################################# # PERL Program "fnaindex.pl" # # March 28th, 1998 # # AUTHORS # Anton Chuppin # Geoff Gerrietts # Eric Larson # John laPlante # # PURPOSE # This program is designed to test parsing logic for HTML pages derived form # the Flora of North America (FNA). # # FNA STRUCTURE # The principal structure being tested in this program is the usefullness of # using words printed in BOLD as a mark-up langauge for searching web pages # derived from the FNA. It was determined that the morphological descriptions # have a reasonably controlled vocabulary when describing the distinctive # features of the genus or species. The goal of this program is to determine # whether using BOLD words as mark-up delimiters is useful. Usually, the # bold descriptor (leaves, bark stems) was followed by descriptive text. The # most useful text was found to be in one paragraph (ended by a

html # markup tag. # # BOLDWORD FILE # This program requires a list of bold words sifted from the collection. A # program called boldword.pl is used to create the needed text file. The # boldword.pl program sifts out all word delimited by bold tags and stores # them into a file. In tests, approximately 3.5 megabytes of text were # sifted into 100 lines of bold words. The resultant file was cut to # approximately 35 by manual processing to eliminate obviously errant # and non-descriptive words. # # STOPWORD FILE # A simple stop word file is also used by the program. Each stop word # is on one-line. # # COMMAND-LINE SYNTAX # index.pl [-d] [-i in-dir] [-o out-dir] [-b bold-wrd] [-s stop-wrd] # index.pl is the script itself # -d turns on debugging (default off) # -i is the input directory (default 'fnadata') # -o is the output directory (default 'indexes') # -b is the boldword file (default 'bold.wrd') # -s is the stopword file (default 'stop.wrd') # # OUTPUT FILE # returns gdbm databases equal in count to the number of boldwords # ################################################################### # Include file for GDBM database and initialization of the GDBM database. # the use line. Place at top of script. use GDBM_File; use Getopt::Std; require '/usr2/people/lis429/public_html/stem.pl'; unless (getopts('i:o:s:b:hd')) { print STDERR "Illegal option.\n"; die(&printusage()); } # Enable better file writing (hopefully) umask(000); # Variable for default gdbm database where no indexable term is found at # the start of the line. $opentext = "open_text" ; # Test for proper command line, exit if not if ($opt_h) { print &printusage(); exit(0); } $debugging = ($opt_d || 0); $stopfile = ($opt_s || "stop.wrd"); $boldfile = ($opt_b || "boldword.sort"); $subdir = ($opt_i || "fnaprocess"); if ($subdir !~ /\/$/) { $subdir .= '/'; } $outdir = ($opt_o || "indexes"); if ($outdir !~/\/$/) { $outdir .= '/'; } # test for simple failures early on unless (((-d $outdir)&&(-w $outdir)) || mkdir($outdir,0775)) { die ("$0 died -- output directory $outdir could not be created or written to\n"); } unless (open (STOPFILE, "$stopfile")) { # Pick-up stop-file die ("$0 Died -- Stop file \"$stopfile\" could not be input\n") ; } unless (open (BOLDFILE, "$boldfile")) { # Pick-up boldword file die ("$0 Died -- Boldword file \"$boldfile\" could not be input\n") ; } unless (opendir (DATADIR, $subdir)) { ; # Open up the data sub-directory die "$0 Died -- $subdir data input sub-dir cannot be opened" ; } # read in stopwords while ($stopword = ) { # Read a line of text, parse, translate, save chop($stopword) ; # Remove the ending newline $stopword =~ tr/A-Z/a-z/ ; # Convert to lowercase $stopword =~ tr/-/ / ; # Convert -'s to spaces $stopword =~ s/ +//g ; # Remove blanks if ($stopword ne "") { # Don't create NULL entries $stoplist{"$stopword"} = 1 ; # Create/fill Assoc array } } close STOPFILE ; # Read in boldwords -- the goal is to create an associative array where # the "to be tested boldword" is the hash key, and the value is the name of # the gdbm file that is supposed to hold the data. There can be multiple # copies of the "value," but there should only be one instance of the "key." # The structure of the boldword file is: # test_word \t gdbm_name \n # with no spaces. The list needs to be in small letters and should use # underlines for multi-word boldwords. The processing done by fnaprocess.pl # should ensure that each boldword already has the underlines and that the # first word on the line is the candidate boldword. A good example is the # boldword.sort that was created by hand and should be in the same sub-dir # as this code. Beware of the if test that looks to see if the word parsed # from the file is a key to this associative array -- I don't know for sure # how to test this. while ($boldword = ) { # Read a line of text, parse, translate, save chomp($boldword) ; # Remove the ending newline $boldword =~ s/ +//g ; # Remove blanks @Boldinput = split(/\t/, $boldword); # Split on the tab character if ($Boldinput[0] ne "") { # Don't create NULL entries $boldlist{$Boldinput[0]} = $Boldinput[1] ; # Create/fill Assoc array } } close BOLDFILE ; # Call the sub to create the gdbm databases. $db = &database_start($outdir, values(%boldlist)) ; while ($filelist = readdir(DATADIR)) { $readfile = $subdir . $filelist ; unless (open(INFILE, "$readfile")) { # Open the input file print STDERR "$0 Error: ". "$filelist input file cannot be opened -- skipping\n" ; } # unless (open (OUTFILE, ">>$outfile")) { # Open the output file # if (!(-w $outfile)) { # die ("$0 Died -- No write permissions to $outfile \n") ; # } else { # die ("$0 Died -- Outfile $outfile cannot be opened \n") ; # } # } while ($line = ) { chomp($line) ; $line =~ tr/A-Z/a-z/; $line =~ tr/-/_/ ; $line =~ s/\ \;/_/ ; $line =~ s/\<.*\>/ /g ; my(@WordList,@indexlist) = (); @WordList = split(/\s+/, $line); $indexname= shift(@WordList) ; foreach $Word (@WordList) { # Inspect each word if (!($stoplist{$Word} eq "1")) { # Test word to stoplist $Word =~ s/ +//g; $Word =~ s/\W//g; if($Word ne ""){ push (@indexlist, $Word); } } } #************************************************************************* # The "if ($boldlist{$indexname})" IF test is untested code -- beware of it # The goal is to test the key, then save to the gdbm file that is the # value for the key. If the key is not found, then the default open_text # database name is used. #************************************************************************* if (@indexlist > 0){ @indexlist = &stem(@indexlist); if ($boldlist{$indexname}) { $db = &database_print($db, $boldlist{$indexname}, $filelist, @indexlist); } else { push(@indexlist, $indexname); $indexname = 'open_text'; $db = &database_print($db, $opentext, $filelist, @indexlist); } } } close (INFILE) ; # Close the two files } print ("$0 terminated normally\n") ; exit (0); # Begin Subroutines ------------------------------------ # component.pl # This file represents components designed to be added to the indexing # script. The 'use' line should be placed at the top of the script; it's # much like an #include is to the C preprocessor. The two subroutine decls # can occur anywhere in the script. database_start needs to be called early # on while database_print currently must be called once per line # the use line. Place at top of script. # use GDBM_File; # database_start # This routine initializes the databases--creates them and dumps to them. # Presently, we're dumping to /all/ the boldwords that are supplied in the # argument list, so if we want to do only one, only call it with the one # argument. sub database_start { my($prefix , @bolds) = @_; # argument list: boldwords my(%databases); # one reference to a gdbm hash per bold my(%dbrefs); # one reference to a gdbm class per bold my(%dbstruct); # a hash to hold both dbrefs and databases # pluck out unique 'boldwords' my(%tmphash); foreach $bold (@bolds) { $tmphash{$bold}++; } @bolds=keys(%tmphash); foreach $bold (@bolds) { # for each boldword in our arg list my(%db); # block-local: cleared each time thru loop my($dbnum); # %db for hash, $dbnum for class reference my($filename) = $prefix . $bold.".gdbm"; # perform the tie() -- see man perlfunc/man perltie/man GBDM_File $dbnum = tie(%db, GDBM_File, $filename , &GDBM_WRCREAT, 0664); # take reference to hash and put in %databases, put class ref into # %dbref $databases{$bold}=\%db; $dbref{$bold}=$dbnum; } # do next boldword # put references to the hash and class collections into %dbstruct $dbstruct{'databases'}=\%databases; $dbstruct{'references'}=\%dbref; # return a reference to %dbstruct return(\%dbstruct); } # database_start needs to be called after the boldwords are initialized. Our # final draft should call it thus: # $db = &database_start(keys(%boldwords)); # database_print needs to be called after collecting all the words from a line # into an array. A useful block of code to do this might be: # # if ($filelist && $boldword && @words ) { # $db = &database_print($db, $boldword, $filelist, @words); # } # # database_print: # This routine should neatly substitute for the print statement presently in # place. If we need to chop it back so it only handles one word at a time # rather than an array of words, change the @ to a $ and stick a # at the # front of the foreach and its closing bracket sub database_print { my ($dbs, $boldword, $document, @words)=@_; foreach $word (@words) { # will repeat for each word in the line # The dereferencing here is all done inline to prevent repeatedly # re-hashing new references after modifying copies. In other words, # it's simpler to actually look at this mess three times than to # unpack the contents one level at a time. # The dereferencing looks like this: # {$dbs}->{'databases'}->{$boldword}->{$word} if (($oldval = ${${${$dbs}{'databases'}}{$boldword}}{$word}) && ($oldval !~ /\b$document\b/)) { ${${${$dbs}{'databases'}}{$boldword}}{$word}=$oldval.' '.$document; } elsif (!$oldval) { # if no documents for this word ${${${$dbs}{'databases'}}{$boldword}}{$word}=$document; } # fall thru if $document already in hash{$word} } # foreach $word return($dbs); } # # printusage: # This routine merely prints a list of all recognized switches. Good for # when we get a bad switch/option or when something fails. Can call this and # die in the event of an error, for example. sub printusage { return ( "Usage:\n". "$0 [-h]\n". "$0 [-d][-i in-dir] [-o out-dir] [-b bold-file] [-s stop-file]\n". "\n". "-h prints this message\n". "-d run in debugging mode\n". "-i in-dir snarf input from in-dir\n". "-o out-dir plop databases into out-dir (create if nonexistant)\n". "-b bold-file use bold-file as list of boldwords (headings)\n". "-s stop-file use stop-file as list of stopwords\n"); }