#!/usr/bin/perl
#by Alex Frakt - frakt@alexia.lis.uiuc.edu
#by Anton Chuppin - achuppin@alexia.lis.uiuc.edu
#
# Usage: perl <this script> INPATH OUTPATH
#
#   INPATH   directory holding the HTML source files (a trailing "/" is
#            added when missing).
#   OUTPATH  prefix prepended verbatim to every output file name, so pass
#            a directory WITH its trailing "/" (or leave it out to write
#            into the current directory).
#
# Every input file is flattened into one long line, split into paragraphs
# on </P>, and written back out as one small HTML file per taxon entry.
# Output files are numbered consecutively starting at 10000.html.
#
# A file holding fewer than two bold "<B> N." identifiers is treated as a
# genus-only file and yields one output file.  Otherwise the first
# identifier is taken as the genus header and every following identifier
# yields one species file, which also repeats the genus descriptor and
# genus description.
#
# NOTE(review): the copy of this script that was reviewed had lost its line
# breaks and every "<TAG>"-looking literal.  The tag literals below (<BR>,
# <B>, <INFILE>-style reads and the HTML header/footer) were reconstructed
# from the surviving "</B>" / "</P>" escapes and the placeholder comments.
# Confirm them against a pristine copy before relying on the exact output.

use strict;
use warnings;

# Paragraph/token patterns.  The input is matched case-sensitively.
my $BOLD     = qr/<B>/;                      # any bold start tag
my $END_BOLD = qr/<\/B>/;                    # bold end tag
my $ENTRY_ID = qr/<B> +[0-9]+[a-z]?\./;      # "<B> 1." / "<B> 12a." identifier

# Tag written at the end of every output line.
# NOTE(review): presumably <BR>; reconstructed, see header.
my $LINE_BREAK = "<BR>";

my $inpath  = $ARGV[0];
my $outpath = defined $ARGV[1] ? $ARGV[1] : '';
die "usage: $0 INPATH OUTPATH\n" unless defined $inpath && length $inpath;

# The input path is concatenated directly with each file name below.
$inpath .= '/' unless $inpath =~ m{/\z};

my $filename  = 10000;     # number of the next output file
my $extension = "html";

# Return paragraph/token $idx of the array referenced by $items, or the
# empty string when $idx is outside the array.  The parser deliberately
# probes neighbouring indexes, so out-of-range reads are expected.
sub item_at {
    my ($items, $idx) = @_;
    return '' if $idx < 0 || $idx > $#{$items};
    return $items->[$idx];
}

# True when a paragraph is a "descriptor": it contains bold text but is
# neither an entry identifier nor the SELECTED REFERENCE paragraph.
sub is_descriptor {
    my ($text) = @_;
    return 0 unless $text =~ $BOLD;
    return 0 if $text =~ $ENTRY_ID;
    return 0 if $text =~ /SELECTED REFERENCE/;
    return 1;
}

# Index of the first paragraph holding an entry identifier, or the number
# of paragraphs when there is none.
sub first_entry_index {
    my ($paras) = @_;
    my $i = 0;
    $i++ while $i < @{$paras} && $paras->[$i] !~ $ENTRY_ID;
    return $i;
}

# Open the next numbered output file and write the HTML header plus the
# Source and Filename lines.  Returns the output handle.
sub open_entry {
    my ($file) = @_;
    my $path = "$outpath$filename.$extension";
    open(my $out, '>', $path) or die "can't open $path for writing: $!";
    # NOTE(review): header markup reconstructed, see header comment.
    print {$out} "<HTML>\n<HEAD>\n<TITLE>$filename.html</TITLE>\n</HEAD>\n<BODY>\n\n";
    print {$out} "<B>Source</B> $file $LINE_BREAK\n";
    print {$out} "<B>Filename</B> $filename.html $LINE_BREAK\n";
    return $out;
}

# Write the HTML footer, close the output file and advance the counter.
sub close_entry {
    my ($out) = @_;
    # NOTE(review): footer markup reconstructed, see header comment.
    print {$out} "\n</BODY>\n</HTML>\n";
    close($out) or die "can't close output file $filename.$extension: $!";
    $filename++;
    return;
}

# Collect consecutive paragraphs starting at $i into one description,
# stopping at the first paragraph that contains bold text.
# Returns (next index, description, flag); the flag is 1 when the very
# first paragraph examined was already bold, i.e. there is no description.
sub collect_genus_description {
    my ($paras, $i) = @_;
    my $description = '';
    my $missing     = 0;
    for (my $q = 0; $i < @{$paras}; $q++) {
        if ($paras->[$i] =~ $BOLD) {
            $missing = 1 if $q == 0;
            last;
        }
        $description .= $paras->[$i];
        $i++;
    }
    return ($i, $description, $missing);
}

#####################################################
# PROCESS SINGLE ENTRY (GENUS ONLY) FILE            #
#####################################################
sub write_genus_only {
    my ($file, $paras) = @_;

    #move to paragraph containing genus identifier
    my $i = first_entry_index($paras);
    my @pspecies = split(/\s+/, item_at($paras, $i));

    my $out = open_entry($file);

    for (my $j = 0; $j < @pspecies; $j++) {
        next unless $pspecies[$j] =~ $BOLD;
        next unless item_at(\@pspecies, $j + 1) =~ /[0-9]+\./;

        $j += 2;    # step over "<B>" and the entry number
        print {$out} "<B>Genus</B> ", item_at(\@pspecies, $j), " $LINE_BREAK\n";
        $j++;
        print {$out} "<B>Species</B> $LINE_BREAK\n";
        print {$out} "<B>Variant</B> ", item_at(\@pspecies, $j), "$LINE_BREAK\n";
        $j++;
        print {$out} "<B>Reference</B>$LINE_BREAK\n";
        $j++;       #skip "-" before common name
        print {$out} "<B>Common Name</B>";
        for (; $j < @pspecies; $j++) {
            print {$out} " $pspecies[$j]";
        }
        print {$out} "$LINE_BREAK\n";
        $j++;
    }
    $i++;

    #check if paragraph contains descriptors
    my $t = item_at($paras, $i);
    if (is_descriptor($t)) {
        print {$out} "$t $LINE_BREAK\n";
        $i++;
    }

    #if no bold assume paragraph is range and output it
    if (item_at($paras, $i) !~ $BOLD) {
        print {$out} "<B>Range</B> ", item_at($paras, $i), " $LINE_BREAK\n";
        $i++;
    }

    #build genus description if there is one; stops at SELECTED REFERENCE
    #or any other bold paragraph
    my ($gdescription, $nogdescription);
    ($i, $gdescription, $nogdescription) = collect_genus_description($paras, $i);
    if (!$nogdescription) {
        print {$out} "<B>Description</B> $gdescription $LINE_BREAK\n";
    }

    if (item_at($paras, $i) =~ /REFERENCE/) {
        print {$out} item_at($paras, $i), " $LINE_BREAK\n";
        $i++;
    }

    close_entry($out);
    return;
}

#####################################################
# PROCESS GENUS + SPECIES (multiple entries) FILE   #
#####################################################
# $species_count is the number of species identifier paragraphs expected
# after the genus identifier.
sub write_species_entries {
    my ($file, $paras, $species_count) = @_;

    ################ genus part ################

    #move to, then past, the paragraph containing the genus identifier
    my $i = first_entry_index($paras);
    $i++;

    #check if paragraph is junk
    $i++ if item_at($paras, $i) !~ $BOLD;

    #check if paragraph contains descriptors
    my $gdescriptor;
    my $nogdescriptor = 0;    #set to 1 if there is no genus descriptor
    my $t = item_at($paras, $i);
    if (is_descriptor($t)) {
        $gdescriptor = $t;
        $i++;
    }
    else {
        $nogdescriptor = 1;
    }

    #this increment will skip Range if it exists
    $i++ if item_at($paras, $i) !~ $BOLD;

    #build genus description if there is one
    my ($gdescription, $nogdescription);
    ($i, $gdescription, $nogdescription) = collect_genus_description($paras, $i);

    $i++ if item_at($paras, $i) =~ /REFERENCE/;

    ################ species part ################

    for my $entry (1 .. $species_count) {
        #Look in immediate vicinity for species identifier paragraph
        #if we are not at one already. Example of species id paragraph:
        # 1. Taxus brevifolia Nuttall, N. Amer. Sylv. 3: 86, plate 108. 1849 - Pacific yew
        if (item_at($paras, $i) !~ $ENTRY_ID) {
            # try the previous paragraph first, then the next one
            my ($found) = grep { item_at($paras, $_) =~ $ENTRY_ID } ($i - 1, $i + 1);
            if (!defined $found) {
                print "Could not process $file\n";
                die "no species identifier paragraph found in $file\n";
            }
            $i = $found;
        }
        my @pspecies = split(/\s+/, item_at($paras, $i));

        my $out = open_entry($file);

        #gets you to the first bold
        for (my $j = 0; $j < @pspecies; $j++) {
            next unless $pspecies[$j] =~ $BOLD;
            next unless item_at(\@pspecies, $j + 1) =~ /[0-9]+[a-z]?\./;

            ####BEGIN PARSE SPECIES IDENTIFIER PARAGRAPH####
            $j += 2;    # step over "<B>" and the entry number
            print {$out} "<B>Genus</B> ", item_at(\@pspecies, $j), " $LINE_BREAK\n";
            $j++;
            print {$out} "<B>Species</B> ", item_at(\@pspecies, $j), "$LINE_BREAK\n";
            $j++;

            # everything up to the closing bold tag; bounded so a missing
            # "</B>" cannot loop forever
            print {$out} "<B>Variant</B>";
            for (; $j < @pspecies && $pspecies[$j] !~ $END_BOLD; $j++) {
                print {$out} " $pspecies[$j]";
            }
            print {$out} "$LINE_BREAK\n";
            $j++;       #to skip "</B>"
            $j++ if item_at(\@pspecies, $j) =~ /\,/;    #to skip "," if exists

            # everything up to the first token containing "-"
            print {$out} "<B>Reference</B>";
            for (; $j < @pspecies && $pspecies[$j] !~ /\-/; $j++) {
                print {$out} " $pspecies[$j]";
            }
            print {$out} "$LINE_BREAK\n";

            ####BEGIN COMMON NAME####
            print {$out} "<B>Common Name</B>";
            if (item_at(\@pspecies, $j) eq "-") {
                $j++;
                for (; $j < @pspecies; $j++) {
                    print {$out} " $pspecies[$j]";
                }
            }
            else {
                $j = @pspecies;
            }
            print {$out} " $LINE_BREAK\n";
            ####END COMMON NAME####
        }
        $i++;    #go to next paragraph

        #first paragraph following species name info usually contains junk
        #if junk paragraph exists, skip it
        $i++ if item_at($paras, $i) !~ $BOLD;

        #check if paragraph contains descriptors
        $t = item_at($paras, $i);
        if (is_descriptor($t)) {
            print {$out} "$t $LINE_BREAK\n";
            $i++;
        }
        if ($nogdescriptor != 1) {
            print {$out} "$gdescriptor $LINE_BREAK\n";
        }

        #if no bold assume paragraph is range and output it
        if (item_at($paras, $i) !~ $BOLD) {
            print {$out} "<B>Range</B> ", item_at($paras, $i), " $LINE_BREAK\n";
            $i++;
        }

        if (item_at($paras, $i) !~ $BOLD) {
            #build species description
            my $pdescription = '';
            for (; $i < @{$paras}; $i++) {
                #exit loop if BOLD in paragraph
                if ($paras->[$i] =~ $BOLD) {
                    $i--;
                    last;
                }
                $pdescription .= $paras->[$i];
            }
            print {$out} "<B>Description</B> $pdescription $LINE_BREAK\n";
        }

        # the genus description is repeated in every species file
        if ($nogdescription != 1) {
            print {$out} "<B>Description</B> $gdescription $LINE_BREAK\n";
        }
        $i++;

        if (item_at($paras, $i) =~ /REFERENCE/) {
            print {$out} item_at($paras, $i), " $LINE_BREAK\n";
            $i++;
        }

        close_entry($out);
    }
    return;
}

# Read one input file, normalise its markup and hand the paragraph list to
# the genus-only or the multiple-entry writer.
sub process_file {
    my ($file, $infile) = @_;

    # Join the whole file into one line so tags spanning lines still match.
    my $longline = '';
    open(my $in, '<', $infile) or die "can't open infile $infile: $!";
    while (my $line = <$in>) {
        $line =~ s/\n/ /;
        $longline .= $line;
    }
    close($in);

    $longline =~ s/<BR>/ /g;              # delete <BR>
    $longline =~ s/<\/P>/ ENDPARA /g;     # Protect </P> as " ENDPARA "
    $longline =~ s/<B>/ BEGINBOLD /g;     # Protect <B> as " BEGINBOLD "
    $longline =~ s/<\/B>/ ENDBOLD /g;     # Protect </B> as " ENDBOLD "
    $longline =~ s/<.*?>/ /g;             # delete all other HTML tags

    #@scounter tells us how many different entries are found in the file:
    #it has one element more than there are "BEGINBOLD N." identifiers
    #(unless the text ends exactly on an identifier)
    my @scounter = split(/BEGINBOLD +[0-9]+[a-z]?\./, $longline);

    $longline =~ s/ENDPARA/<\/P>/g;       # Put </P> back
    $longline =~ s/BEGINBOLD/<B>/g;       # Put <B> back
    $longline =~ s/ENDBOLD/<\/B>/g;       # Put </B> back

    #make sure genus and species identifier paragraphs are in expected
    #format: the entry number must be a whitespace-separated token
    $longline =~ s/($ENTRY_ID)/$1 /g;

    my @gypak = split(m{</P>}, $longline);

    if (@scounter < 3) {
        #fewer than two identifiers: only genus info (no species) in file
        write_genus_only($file, \@gypak);
    }
    else {
        #first identifier is the genus, the remaining ones are species
        write_species_entries($file, \@gypak, @scounter - 2);
    }
    return;
}

# Only regular files are processed ("." / ".." and subdirectories used to
# produce empty junk entries); sorting makes the numbering reproducible.
opendir(my $dir, $inpath) or die "can't open directory $inpath: $!";
my @files = sort grep { -f "$inpath$_" } readdir($dir);
closedir($dir);

foreach my $file (@files) {
    print "$file\n";
    my $infile = $inpath . $file;
    print "$infile\n";
    process_file($file, $infile);
}    # end for each file loop