#!/usr/bin/perl
#############################################################################
# PERL Program "fnaindex.pl"
# 
# March 28th, 1998
#
# AUTHORS
# Anton Chuppin
# Geoff Gerrietts
# Eric Larson
# John laPlante
#
# PURPOSE
# This program is designed to test parsing logic for HTML pages derived form
# the Flora of North America (FNA).
#
# FNA STRUCTURE
# The principal structure being tested in this program is the usefullness of
# using words printed in BOLD as a mark-up langauge for searching web pages
# derived from the FNA.  It was determined that the morphological descriptions
# have a reasonably controlled vocabulary when describing the distinctive
# features of the genus or species.  The goal of this program is to determine
# whether using BOLD words as mark-up delimiters is useful.  Usually, the
# bold descriptor (leaves, bark stems) was followed by descriptive text.  The
# most useful text was found to be in one paragraph (ended by a <P> html
# markup tag.
#
# BOLDWORD FILE
# This program requires a list of bold words sifted from the collection.  A
# program called boldword.pl is used to create the needed text file.  The
# boldword.pl program sifts out all word delimited by bold tags and stores 
# them into a file.  In tests, approximately 3.5 megabytes of text were 
# sifted into 100 lines of bold words.  The resultant file was cut to
# approximately 35 by manual processing to eliminate obviously errant 
# and non-descriptive words.
#
# STOPWORD FILE
# A simple stop word file is also used by the program.  Each stop word
# is on one-line.
#
# COMMAND-LINE SYNTAX
# index.pl [-d] [-i in-dir] [-o out-dir] [-b bold-wrd] [-s stop-wrd]
#   index.pl is the script itself
#   -d turns on debugging (default off)
#   -i is the input directory (default 'fnadata')
#   -o is the output directory (default 'indexes')
#   -b is the boldword file (default 'bold.wrd')
#   -s is the stopword file (default 'stop.wrd')
#
# OUTPUT FILE
#   returns gdbm databases equal in count to the number of boldwords
#
###################################################################
 
# Include file for GDBM database and initialization of the GDBM database. 

# the use line. Place at top of script.
use GDBM_File;
use Getopt::Std;
require '/usr2/people/lis429/public_html/stem.pl';

unless (getopts('i:o:s:b:hd')) {
    print STDERR "Illegal option.\n";
    die(&printusage());
}

# Enable better file writing (hopefully)
umask(000);

# Variable for default gdbm database where no indexable term is found at
# the start of the line.    
$opentext = "open_text" ;

# Test for proper command line, exit if not
if ($opt_h) {
	print &printusage();
	exit(0);
}

$debugging = ($opt_d || 0);
$stopfile = ($opt_s || "stop.wrd");
$boldfile = ($opt_b || "boldword.sort");
$subdir = ($opt_i || "fnaprocess");
if ($subdir !~ /\/$/) {
	$subdir .= '/';
}
$outdir = ($opt_o || "indexes");
if ($outdir !~/\/$/) {
	$outdir .= '/';
}

# test for simple failures early on

unless (((-d $outdir)&&(-w $outdir)) || mkdir($outdir,0775)) {
    die ("$0 died -- output directory $outdir could not be created 
        or written to\n");
}

unless (open (STOPFILE, "$stopfile")) {   # Pick-up stop-file
    die ("$0 Died -- Stop file \"$stopfile\" could not be input\n") ;
}

unless (open (BOLDFILE, "$boldfile")) {   # Pick-up boldword file
    die ("$0 Died -- Boldword file \"$boldfile\" could not be input\n") ;
}

unless (opendir (DATADIR, $subdir)) { ;    # Open up the data sub-directory
    die "$0 Died -- $subdir data input sub-dir cannot be opened" ;
}

# read in stopwords

while ($stopword = <STOPFILE>) {  # Read a line of text, parse, translate, save

    chop($stopword) ;                     # Remove the ending newline
    $stopword =~ tr/A-Z/a-z/ ;            # Convert to lowercase
    $stopword =~ tr/-/ / ;                # Convert -'s to spaces
    $stopword =~ s/ +//g ;                # Remove blanks

    if ($stopword ne "") {                # Don't create NULL entries
        $stoplist{"$stopword"} = 1 ;      # Create/fill Assoc array
    }
}

close STOPFILE ;

# Read in boldwords -- the goal is to create an associative array where
# the "to be tested boldword" is the hash key, and the value is the name of
# the gdbm file that is supposed to hold the data.  There can be multiple
# copies of the "value," but there should only be one instance of the "key." 
# The structure of the  boldword file is:  
# test_word \t gdbm_name \n  
# with no spaces.  The list needs to be in small letters and should use
# underlines for multi-word boldwords.  The processing done by fnaprocess.pl
# should ensure that each boldword already has the underlines and that the
# first word on the line is the candidate boldword.  A good example is the
# boldword.sort that was created by hand and should be in the same sub-dir
# as this code.  Beware of the if test that looks to see if the word parsed
# from the file is a key to this associative array -- I don't know for sure
# how to test this.

while ($boldword = <BOLDFILE>) {  # Read a line of text, parse, translate, save

    chomp($boldword) ;                     # Remove the ending newline
    $boldword =~ s/ +//g ;                 # Remove blanks
    @Boldinput = split(/\t/, $boldword);   # Split on the tab character
    
    if ($Boldinput[0] ne "") {                # Don't create NULL entries
        $boldlist{$Boldinput[0]} = $Boldinput[1] ;   # Create/fill Assoc array
    }
}

close BOLDFILE ;

# Call the sub to create the gdbm databases.
$db = &database_start($outdir, values(%boldlist)) ;
 
while ($filelist = readdir(DATADIR)) {
  $readfile = $subdir . $filelist ;
  unless (open(INFILE, "$readfile")) {         # Open the input file
     print STDERR "$0 Error: ".
        "$filelist input file cannot be opened -- skipping\n" ;
  }

#  unless (open (OUTFILE, ">>$outfile")) {     # Open the output file
#    if (!(-w $outfile)) {
#      die ("$0 Died -- No write permissions to $outfile \n") ;
#    } else {
#      die ("$0 Died -- Outfile $outfile cannot be opened \n") ;
#    }
#  }
 
  while ($line = <INFILE>) {
    chomp($line) ;
    $line =~ tr/A-Z/a-z/;  
    $line =~ tr/-/_/ ;
    $line =~ s/\&nbsp\;/_/ ;
    $line =~ s/\<.*\>/ /g ;
	my(@WordList,@indexlist) = ();
    @WordList = split(/\s+/, $line);  
    $indexname= shift(@WordList) ; 

    foreach $Word (@WordList) {            # Inspect each word 
      if (!($stoplist{$Word} eq "1")) {    # Test word to stoplist
         $Word =~ s/ +//g;
         $Word =~ s/\W//g;
         if($Word ne ""){
           push (@indexlist, $Word);
         }
      } 
    }
#*************************************************************************
# The "if ($boldlist{$indexname})" IF test is untested code -- beware of it
# The goal is to test the key, then save to the gdbm file that is the
# value for the key.  If the key is not found, then the default open_text
# database name is used.
#*************************************************************************
    if (@indexlist > 0){    
       @indexlist = &stem(@indexlist);
       if ($boldlist{$indexname})  {
          $db = &database_print($db, $boldlist{$indexname}, $filelist, @indexlist);
       } else {
         push(@indexlist, $indexname);
         $indexname = 'open_text';
         $db = &database_print($db, $opentext, $filelist, @indexlist);
       }
    }
  }
  close (INFILE) ;                          # Close the two files
}    

print ("$0 terminated normally\n") ;
exit (0);


# Begin      Subroutines      ------------------------------------
# component.pl
#   This file represents components designed to be added to the indexing
#   script. The 'use' line should be placed at the top of the script; it's
#   much like an #include is to the C preprocessor. The two subroutine decls
#   can occur anywhere in the script. database_start needs to be called early
#   on while database_print currently must be called once per line


# the use line. Place at top of script.
# use GDBM_File;


# database_start
#   This routine initializes the databases--creates them and dumps to them.
#   Presently, we're dumping to /all/ the boldwords that are supplied in the
#   argument list, so if we want to do only one, only call it with the one
#   argument.

sub database_start {
	my($prefix , @bolds) = @_;	# argument list: boldwords
	my(%databases);				# one reference to a gdbm hash per bold
	my(%dbrefs);				# one reference to a gdbm class per bold
	my(%dbstruct);				# a hash to hold both dbrefs and databases

	# pluck out unique 'boldwords'
	my(%tmphash);
	foreach $bold (@bolds) {
		$tmphash{$bold}++;
	}
	@bolds=keys(%tmphash);


	foreach $bold (@bolds) {	# for each boldword in our arg list
		my(%db);				# block-local: cleared each time thru loop
		my($dbnum);				# %db for hash, $dbnum for class reference

		my($filename) = $prefix . $bold.".gdbm";

		# perform the tie() -- see man perlfunc/man perltie/man GBDM_File
		$dbnum = tie(%db, GDBM_File, $filename , &GDBM_WRCREAT, 0664);

		# take reference to hash and put in %databases, put class ref into
		# %dbref
		$databases{$bold}=\%db;
		$dbref{$bold}=$dbnum;
	} # do next boldword

	# put references to the hash and class collections into %dbstruct
	$dbstruct{'databases'}=\%databases;
	$dbstruct{'references'}=\%dbref;

	# return a reference to %dbstruct
	return(\%dbstruct);
}

# database_start needs to be called after the boldwords are initialized. Our
# final draft should call it thus:
# $db = &database_start(keys(%boldwords));


# database_print needs to be called after collecting all the words from a line
# into an array. A useful block of code to do this might be:
#
# if ($filelist && $boldword && @words ) {
# 	$db = &database_print($db, $boldword, $filelist, @words);
# }

#
# database_print: 
# 	This routine should neatly substitute for the print statement presently in
# 	place. If we need to chop it back so it only handles one word at a time
# 	rather than an array of words, change the @ to a $ and stick a # at the
# 	front of the foreach and its closing bracket

sub database_print {
	my ($dbs, $boldword, $document, @words)=@_;

	foreach $word (@words) { 	# will repeat for each word in the line

		# The dereferencing here is all done inline to prevent repeatedly
		# re-hashing new references after modifying copies. In other words,
		# it's simpler to actually look at this mess three times than to
		# unpack the contents one level at a time.
		# The dereferencing looks like this:
		# {$dbs}->{'databases'}->{$boldword}->{$word}
		if (($oldval = ${${${$dbs}{'databases'}}{$boldword}}{$word}) && 
			($oldval !~ /\b$document\b/)) { 
			${${${$dbs}{'databases'}}{$boldword}}{$word}=$oldval.' '.$document;
		} elsif (!$oldval) { # if no documents for this word
			${${${$dbs}{'databases'}}{$boldword}}{$word}=$document;
		} # fall thru if $document already in hash{$word}
	} # foreach $word


	return($dbs);
}

#
# printusage:
#   This routine merely prints a list of all recognized switches. Good for
#   when we get a bad switch/option or when something fails. Can call this and
#   die in the event of an error, for example.

sub printusage {
	return (
"Usage:\n".
"$0 [-h]\n".
"$0 [-d][-i in-dir] [-o out-dir] [-b bold-file] [-s stop-file]\n".
"\n".
"-h              prints this message\n".
"-d              run in debugging mode\n".
"-i in-dir       snarf input from in-dir\n".
"-o out-dir      plop databases into out-dir (create if nonexistant)\n".
"-b bold-file    use bold-file as list of boldwords (headings)\n".
"-s stop-file    use stop-file as list of stopwords\n");
}