#!/usr/bin/perl
#############################################################################
# PERL Program "fnaindex.pl"
#
# March 28th, 1998
#
# AUTHORS
# Anton Chuppin
# Geoff Gerrietts
# Eric Larson
# John laPlante
#
# PURPOSE
# This program is designed to test parsing logic for HTML pages derived form
# the Flora of North America (FNA).
#
# FNA STRUCTURE
# The principal structure being tested in this program is the usefullness of
# using words printed in BOLD as a mark-up langauge for searching web pages
# derived from the FNA. It was determined that the morphological descriptions
# have a reasonably controlled vocabulary when describing the distinctive
# features of the genus or species. The goal of this program is to determine
# whether using BOLD words as mark-up delimiters is useful. Usually, the
# bold descriptor (leaves, bark stems) was followed by descriptive text. The
# most useful text was found to be in one paragraph (ended by a
html
# markup tag.
#
# BOLDWORD FILE
# This program requires a list of bold words sifted from the collection. A
# program called boldword.pl is used to create the needed text file. The
# boldword.pl program sifts out all word delimited by bold tags and stores
# them into a file. In tests, approximately 3.5 megabytes of text were
# sifted into 100 lines of bold words. The resultant file was cut to
# approximately 35 by manual processing to eliminate obviously errant
# and non-descriptive words.
#
# STOPWORD FILE
# A simple stop word file is also used by the program. Each stop word
# is on one-line.
#
# COMMAND-LINE SYNTAX
# index.pl [-d] [-i in-dir] [-o out-dir] [-b bold-wrd] [-s stop-wrd]
# index.pl is the script itself
# -d turns on debugging (default off)
# -i is the input directory (default 'fnadata')
# -o is the output directory (default 'indexes')
# -b is the boldword file (default 'bold.wrd')
# -s is the stopword file (default 'stop.wrd')
#
# OUTPUT FILE
# returns gdbm databases equal in count to the number of boldwords
#
###################################################################
# Include file for GDBM database and initialization of the GDBM database.
# the use line. Place at top of script.
use GDBM_File;
use Getopt::Std;
require '/usr2/people/lis429/public_html/stem.pl';
unless (getopts('i:o:s:b:hd')) {
print STDERR "Illegal option.\n";
die(&printusage());
}
# Enable better file writing (hopefully)
umask(000);
# Variable for default gdbm database where no indexable term is found at
# the start of the line.
$opentext = "open_text" ;
# Test for proper command line, exit if not
if ($opt_h) {
print &printusage();
exit(0);
}
$debugging = ($opt_d || 0);
$stopfile = ($opt_s || "stop.wrd");
$boldfile = ($opt_b || "boldword.sort");
$subdir = ($opt_i || "fnaprocess");
if ($subdir !~ /\/$/) {
$subdir .= '/';
}
$outdir = ($opt_o || "indexes");
if ($outdir !~/\/$/) {
$outdir .= '/';
}
# test for simple failures early on
unless (((-d $outdir)&&(-w $outdir)) || mkdir($outdir,0775)) {
die ("$0 died -- output directory $outdir could not be created
or written to\n");
}
unless (open (STOPFILE, "$stopfile")) { # Pick-up stop-file
die ("$0 Died -- Stop file \"$stopfile\" could not be input\n") ;
}
unless (open (BOLDFILE, "$boldfile")) { # Pick-up boldword file
die ("$0 Died -- Boldword file \"$boldfile\" could not be input\n") ;
}
unless (opendir (DATADIR, $subdir)) { ; # Open up the data sub-directory
die "$0 Died -- $subdir data input sub-dir cannot be opened" ;
}
# read in stopwords
while ($stopword = ) { # Read a line of text, parse, translate, save
chop($stopword) ; # Remove the ending newline
$stopword =~ tr/A-Z/a-z/ ; # Convert to lowercase
$stopword =~ tr/-/ / ; # Convert -'s to spaces
$stopword =~ s/ +//g ; # Remove blanks
if ($stopword ne "") { # Don't create NULL entries
$stoplist{"$stopword"} = 1 ; # Create/fill Assoc array
}
}
close STOPFILE ;
# Read in boldwords -- the goal is to create an associative array where
# the "to be tested boldword" is the hash key, and the value is the name of
# the gdbm file that is supposed to hold the data. There can be multiple
# copies of the "value," but there should only be one instance of the "key."
# The structure of the boldword file is:
# test_word \t gdbm_name \n
# with no spaces. The list needs to be in small letters and should use
# underlines for multi-word boldwords. The processing done by fnaprocess.pl
# should ensure that each boldword already has the underlines and that the
# first word on the line is the candidate boldword. A good example is the
# boldword.sort that was created by hand and should be in the same sub-dir
# as this code. Beware of the if test that looks to see if the word parsed
# from the file is a key to this associative array -- I don't know for sure
# how to test this.
while ($boldword = ) { # Read a line of text, parse, translate, save
chomp($boldword) ; # Remove the ending newline
$boldword =~ s/ +//g ; # Remove blanks
@Boldinput = split(/\t/, $boldword); # Split on the tab character
if ($Boldinput[0] ne "") { # Don't create NULL entries
$boldlist{$Boldinput[0]} = $Boldinput[1] ; # Create/fill Assoc array
}
}
close BOLDFILE ;
# Call the sub to create the gdbm databases.
$db = &database_start($outdir, values(%boldlist)) ;
while ($filelist = readdir(DATADIR)) {
$readfile = $subdir . $filelist ;
unless (open(INFILE, "$readfile")) { # Open the input file
print STDERR "$0 Error: ".
"$filelist input file cannot be opened -- skipping\n" ;
}
# unless (open (OUTFILE, ">>$outfile")) { # Open the output file
# if (!(-w $outfile)) {
# die ("$0 Died -- No write permissions to $outfile \n") ;
# } else {
# die ("$0 Died -- Outfile $outfile cannot be opened \n") ;
# }
# }
while ($line = ) {
chomp($line) ;
$line =~ tr/A-Z/a-z/;
$line =~ tr/-/_/ ;
$line =~ s/\ \;/_/ ;
$line =~ s/\<.*\>/ /g ;
my(@WordList,@indexlist) = ();
@WordList = split(/\s+/, $line);
$indexname= shift(@WordList) ;
foreach $Word (@WordList) { # Inspect each word
if (!($stoplist{$Word} eq "1")) { # Test word to stoplist
$Word =~ s/ +//g;
$Word =~ s/\W//g;
if($Word ne ""){
push (@indexlist, $Word);
}
}
}
#*************************************************************************
# The "if ($boldlist{$indexname})" IF test is untested code -- beware of it
# The goal is to test the key, then save to the gdbm file that is the
# value for the key. If the key is not found, then the default open_text
# database name is used.
#*************************************************************************
if (@indexlist > 0){
@indexlist = &stem(@indexlist);
if ($boldlist{$indexname}) {
$db = &database_print($db, $boldlist{$indexname}, $filelist, @indexlist);
} else {
push(@indexlist, $indexname);
$indexname = 'open_text';
$db = &database_print($db, $opentext, $filelist, @indexlist);
}
}
}
close (INFILE) ; # Close the two files
}
print ("$0 terminated normally\n") ;
exit (0);
# Begin Subroutines ------------------------------------
# component.pl
# This file represents components designed to be added to the indexing
# script. The 'use' line should be placed at the top of the script; it's
# much like an #include is to the C preprocessor. The two subroutine decls
# can occur anywhere in the script. database_start needs to be called early
# on while database_print currently must be called once per line
# the use line. Place at top of script.
# use GDBM_File;
# database_start
# This routine initializes the databases--creates them and dumps to them.
# Presently, we're dumping to /all/ the boldwords that are supplied in the
# argument list, so if we want to do only one, only call it with the one
# argument.
sub database_start {
my($prefix , @bolds) = @_; # argument list: boldwords
my(%databases); # one reference to a gdbm hash per bold
my(%dbrefs); # one reference to a gdbm class per bold
my(%dbstruct); # a hash to hold both dbrefs and databases
# pluck out unique 'boldwords'
my(%tmphash);
foreach $bold (@bolds) {
$tmphash{$bold}++;
}
@bolds=keys(%tmphash);
foreach $bold (@bolds) { # for each boldword in our arg list
my(%db); # block-local: cleared each time thru loop
my($dbnum); # %db for hash, $dbnum for class reference
my($filename) = $prefix . $bold.".gdbm";
# perform the tie() -- see man perlfunc/man perltie/man GBDM_File
$dbnum = tie(%db, GDBM_File, $filename , &GDBM_WRCREAT, 0664);
# take reference to hash and put in %databases, put class ref into
# %dbref
$databases{$bold}=\%db;
$dbref{$bold}=$dbnum;
} # do next boldword
# put references to the hash and class collections into %dbstruct
$dbstruct{'databases'}=\%databases;
$dbstruct{'references'}=\%dbref;
# return a reference to %dbstruct
return(\%dbstruct);
}
# database_start needs to be called after the boldwords are initialized. Our
# final draft should call it thus:
# $db = &database_start(keys(%boldwords));
# database_print needs to be called after collecting all the words from a line
# into an array. A useful block of code to do this might be:
#
# if ($filelist && $boldword && @words ) {
# $db = &database_print($db, $boldword, $filelist, @words);
# }
#
# database_print:
# This routine should neatly substitute for the print statement presently in
# place. If we need to chop it back so it only handles one word at a time
# rather than an array of words, change the @ to a $ and stick a # at the
# front of the foreach and its closing bracket
sub database_print {
my ($dbs, $boldword, $document, @words)=@_;
foreach $word (@words) { # will repeat for each word in the line
# The dereferencing here is all done inline to prevent repeatedly
# re-hashing new references after modifying copies. In other words,
# it's simpler to actually look at this mess three times than to
# unpack the contents one level at a time.
# The dereferencing looks like this:
# {$dbs}->{'databases'}->{$boldword}->{$word}
if (($oldval = ${${${$dbs}{'databases'}}{$boldword}}{$word}) &&
($oldval !~ /\b$document\b/)) {
${${${$dbs}{'databases'}}{$boldword}}{$word}=$oldval.' '.$document;
} elsif (!$oldval) { # if no documents for this word
${${${$dbs}{'databases'}}{$boldword}}{$word}=$document;
} # fall thru if $document already in hash{$word}
} # foreach $word
return($dbs);
}
#
# printusage:
# This routine merely prints a list of all recognized switches. Good for
# when we get a bad switch/option or when something fails. Can call this and
# die in the event of an error, for example.
sub printusage {
return (
"Usage:\n".
"$0 [-h]\n".
"$0 [-d][-i in-dir] [-o out-dir] [-b bold-file] [-s stop-file]\n".
"\n".
"-h prints this message\n".
"-d run in debugging mode\n".
"-i in-dir snarf input from in-dir\n".
"-o out-dir plop databases into out-dir (create if nonexistant)\n".
"-b bold-file use bold-file as list of boldwords (headings)\n".
"-s stop-file use stop-file as list of stopwords\n");
}