#!/usr2/local/bin/perl -w

# gdbmbld.pl
# this script is designed to work with a tab-delimited file on stdin
# it will append to a key if the key appears a second time in the input

use GDBM_File;

# Path of the GDBM file to build. Hard-coded for now; this should eventually
# come from the command line or from configuration.
my $filename = "whatever.gdbm";
my(%database);			# declaration: the database (tied hash)
my($dbref);			# declaration: the database-descriptor

# tie() hands back the object that implements the tied hash. That descriptor
# gives direct access to the GDBM engine should we ever want it; it is kept in
# $dbref but is not otherwise used by this application.
# The database itself is %database, and it behaves just like an associative
# array. The only difference is that it uses the file "$filename" for storage
# instead of relying on an in-memory representation.
#
# GDBM_WRCREAT opens the file for reading and writing, creating it if it does
# not exist yet; 0640 is the permission mode applied when the file is created.
# tie() returns a false value when the file cannot be opened, so stop here
# rather than carrying on with a hash that is not backed by anything.
$dbref = tie(%database, 'GDBM_File', $filename, GDBM_WRCREAT(), 0640)
	or die "Cannot open GDBM file '$filename': $!\n";

# At this point, %database lives on disk as a GDBM file

# 
# The main body of the routine reads standard input one line at a time until
# nothing is left. Each line is a tab-delimited record: the first field is the
# key, and every remaining field is a value to be stored under that key. When
# a key has been seen before (in this run or in an existing database file),
# the new values are appended to the ones already stored.

while (my $line = <STDIN>) {

	# let's get rid of those pesky newlines
	chomp($line);

	# Split the input line into discrete tokens. Since Perl provides
	# split(), it's easier to tokenize the entire line than to pluck a
	# single token off and leave the rest intact.
	my @fields = split(/\t/, $line);

	# The key into the hash -- our 'word' -- is the first token. Shift
	# it off so that @fields holds only the values.
	my $key = shift(@fields);

	# A blank line (or one made up solely of tabs) produces no tokens at
	# all, so there is no key. Skip it instead of storing an empty record
	# under an undefined key.
	next unless defined $key;

	# Check the DB for existing data. The test must be defined() rather
	# than plain truth: a stored value of "0" or "" is false in Perl but
	# is still data that has to be kept.
	my $existing = $database{$key};
	if (defined $existing) {

		# Put the previously stored values in front of the new ones so
		# that the combined list keeps its original order.
		unshift(@fields, split(/\t/, $existing));
	}

	# Re-join with tabs so that the stored value is still tab-delimited,
	# and put it back into the hash (and therefore into the GDBM file).
	$database{$key} = join("\t", @fields);

	# Now repeat loop for next line on STDIN
}

# At this point we should have processed every line present on STDIN, and we
# can close everything down. The untie() is good housekeeping and ensures that
# any in-memory cache the database may be using is written back.
#
# untie() cannot release the underlying object while another reference to it
# is still alive, and Perl warns about that under -w. The descriptor saved in
# $dbref is such a reference, so drop it before untying.
undef $dbref;
untie(%database);