#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <ctype.h>

/*
	search.c - Search Engine v1.0

	Purpose:
	- Provide searching capability for a web site.

	How it works:
	- Searches the "keywords" sections to find a match.
	- If a match is found, the title and description of the htm file
	  are displayed.
	- The <META NAME=keywords> tag, and the <META NAME=description> tag
	  are automatically generated by nvpdump9.cgi.

	Requirements:
	- Must include the file, "links.txt". ("links.txt" contains a list
	  of html/htm files found in the web site.)
	- "links.txt" should be located in c:\public-html\

	Mitchell Lau - November 1, 98
*/

#define MAX_ENTRIES 200			/* Max. number of form entries, search words and results */
#define MAX_LENGTH 300			/* Max. length of one search word or keyword             */
#define MAX 1000	     			/* Size of the line/path/title buffers; also the number  */
						/* of keyword slots available per HTML file              */
#define CR 13				/* ASCII carriage return */
#define LF 10				/* ASCII line feed       */

/* One decoded name=value field taken from the posted form data. */
typedef struct
{
	char *name;	/* text before the '=' (split off by makeword)     */
	char *val;	/* text after the '=' once makeword has removed it */
} entry;

typedef struct
{
	char *filename;
	char *title;
	char *descrip;
	int hits;
} result;

/* Splits off the text before `stop` into a malloc'ed string; `line` keeps the rest. */
char *makeword(char *line, char stop);
/* Reads from f up to `stop`, end of file, or until *len has counted down to zero. */
char *fmakeword(FILE *f, char stop, int *len);
/* Decodes %XX escapes in place. */
void unescape_url(char *url);
/* Converts the two hex digits at `what` into one character. */
char x2c(char *what);
/* Replaces every '+' with a space, in place. */
void plustospace(char *str);
/* Lower-cases textIn in place and reports (1/0) whether it matches whatTAG. */
int findMETA (char textIn[], char *whatTAG);
/* Returns the text that follows whatTAG and its opening quote, up to the next quote. */
char* parseText (char textIn[], char *whatTAG);
/* Returns the index of the '>' that closes a leading HTML tag (0 if none leads). */
int skipTags (char dataIn[]);
/* Returns 1 when the two words are equal ignoring case, otherwise 0. */
int foundMatch (char keywords[], char searchWords[]);

int main()
{
	int cl, x=0, etnum=0, stringLen=0;
	int i=0, j=0, wordCount=1;		/* i = inputWord number */
							/* j = char position #  */
	entry entries[MAX_ENTRIES];
	result results[MAX_ENTRIES], swap;
	char searchClause[MAX_ENTRIES][MAX_LENGTH];
	char searchClauseLowCase[MAX_ENTRIES][MAX_LENGTH];
	char *temp, *description;

	FILE *f;   			/* For handling links.txt      */
	FILE *g;			/* For handling html/htm files */
	char msg[MAX]="", data[MAX]="", pathname[MAX]="";
	char tempPathname[MAX]="", *puretext="", checkTitle[MAX];
/* ****** NAME OF THE ROOT DIRECTORY ***** */
	const char *path = "c:\\public-html\\";
	int foundKeywords=0, y=0, searchFound = 0;

	char keywordsHTML[MAX][MAX_LENGTH];
	int pureTextLen = 0;
	int keywordCount = 1, numberOfHits = 0, resultCnt = 0;

	char *META_keywordsTag = "<meta name=keywords content=";
	char *META_descriptionTag = "<meta name=description content=";
	char title[MAX] = "No Title";

	cl = atoi(getenv("CONTENT_LENGTH"));
	etnum = 0;
	for(x=0; cl && (!feof(stdin));x++)
	{
	    entries[x].val = fmakeword(stdin,'&',&cl);
	    plustospace(entries[x].val);
	    unescape_url(entries[x].val);
	    entries[x].name = makeword(entries[x].val,'=');
	    etnum++;
	}

	printf("Content-type: text/html\n\n");
	printf("<HTML>\n");
	printf("<HEAD><TITLE>Search Results</TITLE></HEAD>\n");
	printf("<BODY BGCOLOR=white text=black link=blue vlink=blue alink=red>\n");
	printf("<H1>Search Results</H1>\n");
	printf("<P>Search string: \42<B>%s</B>\42\n", entries[0].val);
	printf("<BR><CENTER><HR></CENTER>\n");

	/* Error checking */
	if (strlen(entries[0].val) == 0)
	{
	   printf("<CENTER>\n");
	   printf("Ha Ha Ha. Very funny. Now type something or leave me alone!!<br>\n");
	   printf("</CENTER>\n");
	   printf("</BODY>\n");
	   printf("</HTML>\n");
	   searchFound = 1;
	   exit;
	}
	/* -------------- */

	/* ----- Parses search clause into individual words ----- */
	temp = entries[0].val;				/* backup entries[0].val */
	stringLen = strlen(entries[0].val);
	for (x=0; x<=stringLen; x++)
	{
		searchClause[i][j] = *entries[0].val;
		if (*entries[0].val == ' ')
		{
			searchClause[i][j] = '\0';
			wordCount++;
			i++;                   		/* i = inputWord number */
			j = -1;				/* j= char position # */
							/* j=-1 since it'll be incremented below */
		}
		if (x == stringLen)
		    /* Insert end-of-line character for the last word. */
		    searchClause[i][j] = '\0';
		entries[0].val++;
		j++;
	}
	strcpy (entries[0].val, temp);
	/* ------------------------------------------------ */

	/* Convert searchClause into lower case */
	for (x=0; x<=wordCount; x++)
	{
	     strcpy(searchClauseLowCase[x], searchClause[x]);

	     for (i=0; i<strlen(searchClauseLowCase[x]); i++)
	     {
		  searchClauseLowCase[x][i] = tolower(searchClauseLowCase[x][i]);
	     }
	}
	/* ------------------------------------ */

	/* Search html/htm files */
/* ***** "LINKS.TXT" IS ASSUMED TO BE LOCATED AT C:\PUBLIC_HTML.  MODIFY IF NECESSARY ***** */
	f = fopen ("c:\\public-html\\links.txt", "rt");
	strcat (tempPathname, path);

	while (!feof(f))
	{
	  /* Read the names of the html/htm files from links.txt */
	  fgets (msg, MAX, f);
	  strcat (tempPathname, msg);
	  i = strlen(tempPathname) - 1;
	  strncpy (pathname, tempPathname, i);
	  strcpy (tempPathname, path);
	  /* --------------------------------------------------- */

	  /* Open the html/htm file and read the content */
	  keywordCount = 1; 		/* Reset keyword counter */
	  g = fopen (pathname, "r");
	  while (!feof(g))
	  {
		fgets (data, MAX, g);

		/* Check if it is a <Title> tag */
		x = 0;
		strcpy (checkTitle, data);
		strncpy (checkTitle, checkTitle, 7);
		checkTitle[7] = '\0';

		if (!strcmp(checkTitle, "<Title>") ||
		    !strcmp(checkTitle, "<TITLE>") ||
		    !strcmp(checkTitle, "<title>"))
		{
		for (i=7; i<strlen(data); i++)
		{
		    if (data[i] != '<')
		    { title[x++] = data[i]; }
		    else
		    {
		      title[x] = '\0';
		      break;
		    }
		}

		}
		/* ---------------------------- */

		foundKeywords = findMETA(data,META_keywordsTag);

		/*
		   - If the keyword section is found, parse the keywords
		     into individual words.
		   - If the keyword section is not found, look in the
		     next line of HTML code.
		   - Note: foundKeywords == 1 doesn't mean the search has
		     found a match!
		*/
		if (foundKeywords == 1)
		{
		   puretext = parseText(data,META_keywordsTag);

		   /* Parse puretext into individual keywords */
		   pureTextLen = strlen(puretext);
		   i = 0;
		   j = 0;
		   for (x=0; x<=pureTextLen; x++)
		   {
			keywordsHTML[i][j] = *puretext;
			if (*puretext == ' ')
			{
			    keywordsHTML[i][j] = '\0';
			    keywordCount++;
			    i++;       		/* i = keyword number */
			    j = -1;		/* j= char position # */
			     /* j=-1 since it'll be incremented below */
			}

			if (x == pureTextLen)
			   { keywordsHTML[i][j] = '\0'; }
			puretext++;
			j++;
		   }
		   /* -------------------------------------- */

		   /* Check if the keywords matches the ones user entered */
		   for (y=0; y<wordCount; y++)
		   /* loop through words in the serach clause */
		   {
			for (x=0; x<keywordCount; x++)
			/* loop through keywords found in HTML */
			{
			     if (foundMatch(keywordsHTML[x], searchClauseLowCase[y]))
			     {
				numberOfHits++;
				searchFound = 1;
			     }
			}
			x = 0;
		   }
		   /* --------------------------------------------------- */
		}

	       if (numberOfHits > 0)
	       {
		   results[resultCnt].filename = strdup (msg);
		   results[resultCnt].title = strdup (title);
		   results[resultCnt].hits = numberOfHits;

		   if (title != "No Title")
		   {
		      strcpy (title, "No Title");   /* Reset title for use the next time */
		   }

		   fgets (data, MAX, g);
		   description = parseText(data,META_descriptionTag);
		   results[resultCnt].descrip = strdup(description);
		   resultCnt++;

		   numberOfHits = 0;          /* Clear history */
		   break;
	       }
	  }
	  fclose(g);
	  /* ------------------------------------------- */
	}
	fclose(f);
	/* -------------------- */

	/* Handle not found situation */
	if (!searchFound)
	{
	   printf ("<H4><I> Not Found ... </I></H4>\n");
	   printf ("Your search produced no results.<BR>\n");
	   printf ("<P>Please refer to the following hints for better search results:\n");
	   printf ("<OL>\n");
	   printf ("<LI> Use more specific terms. (eg. <I> resistors </I> instead of <I> components </I>) \n");
	   printf ("<LI> Insert spaces between words. Do not use commas.\n");
	   printf ("<LI> Check the spelling.\n");
	   printf ("</OL>\n");
	}
	else
	{

	  for (x=0; x<resultCnt-1; x++)
	  {
	      for (i=0; i<resultCnt-1; i++)
	      {
		  if (results[i].hits < results[i+1].hits)
		  {
		      swap = results[i];
		      results[i] = results[i+1];
		      results[i+1] = swap;
		  }
	      }
	  }
	   for (i=0; i<resultCnt; i++)
	   {
/* ***** MODIFY THE NAME OF THE URL! ***** */
	       printf ("<P><dl><dt><FONT SIZE=2>%d%%  </FONT><A HREF=\42http://www.URLname.com/%s\42>%s</A><BR>\n", (results[i].hits * 100 / wordCount), results[i].filename,results[i].title);
	       printf ("<dd>%s\n", results[i].descrip);
	       printf ("</dl>\n");
	   }
	}

	printf("<P>&nbsp<P>\n");
	printf("<CENTER><BR><FONT SIZE=2>\n");
	printf("<P>&nbsp<P><P>&nbsp<P><P>&nbsp<P>\n");
	printf("Search Engine v1.0<BR>\n");
	printf("</CENTER></FONT>\n");
	printf("</BODY>\n");
	printf("</HTML>\n");
	return 0;
}


/*
   Find <META NAME=keywords CONTENT="..."> (or any other tag prefix).

   textIn  - one line of HTML; it is converted to lower case IN PLACE.
   whatTAG - the tag prefix to look for, written in lower case.

   Returns 1 when the lower-cased line begins with whatTAG, otherwise 0.
*/
int findMETA (char textIn[], char *whatTAG)
{
    size_t tagLen = strlen(whatTAG);
    size_t i;

    /* Convert textIn to all lower case before comparing. */
    for (i=0; textIn[i] != '\0'; i++)
    {
	 textIn[i] = (char) tolower((unsigned char) textIn[i]);
    }

    /* A real prefix comparison.  strspn() only checks that the leading
       characters belong to the tag's character set, so it also accepts
       lines that merely happen to be built from the same letters. */
    if (strncmp(textIn, whatTAG, tagLen) == 0)
       { return 1; }

    return 0;
}


/*
   Extract the keywords and remove the HTML tags.

   textIn  - a line that starts with whatTAG followed by a double quote.
   whatTAG - the tag prefix; only its length is used to find where the
	     quoted text starts.

   Returns the characters that follow the opening quote, up to (not
   including) the next double quote or the end of the line.  The result
   lives in a buffer owned by this function: it stays valid until the
   next call and must not be freed by the caller.  If the line is too
   short, or memory runs out, an empty string is returned.
*/
char* parseText (char textIn[], char *whatTAG)
{
      static char emptyText[1] = "";
      static char *textOut = NULL;	/* reused between calls, grown on demand */
      static size_t capacity = 0;
      size_t textInLen = strlen(textIn);
      size_t startPosition = strlen(whatTAG) + 1;   /* The +1 is for " after = */
      size_t count = 0;
      size_t i;

      /* The output is never longer than the input. */
      if (textInLen + 1 > capacity)
      {
	   char *grown = realloc(textOut, textInLen + 1);
	   if (grown == NULL)
	       { return emptyText; }
	   textOut = grown;
	   capacity = textInLen + 1;
      }

      for (i=startPosition; i<textInLen; i++)
      {
	   if (textIn[i] == '\42')
	       { break; }
	   textOut[count++] = textIn[i];
      }
      textOut[count] = '\0';
      return (textOut);
}


/* Skip an HTML tag and return the position of where the 1st tag ends */
/* eg. <TITLE> is the 1st tag.  </TITLE> is the 2nd tag. 	      */
/*								      */
/* Returns the index of the '>' that closes the tag dataIn starts     */
/* with, 0 when dataIn does not start with '<', or the index of the   */
/* terminating '\0' when the tag is never closed.		      */
int skipTags (char dataIn[])
{
      int i = 0;		/* i = the end of the first HTML tag */

      /* Skip the beginning HTML tag */
      if (dataIn[i] == '<')
      {
	  /* Stop at the end of the string too, so an unclosed tag cannot
	     make the scan run past the buffer. */
	  while (dataIn[i] != '\0' && dataIn[i] != '>')
	  {
		i++;
	  }
      }
      return(i);
}


/*
   Check if the keywords in an HTML/HTM file match the search clause.

   Compares the two words ignoring letter case and returns 1 when they
   are equal, 0 otherwise.  The comparison is done by hand because
   strcmpi() is a DOS compiler extension and not part of standard C.
*/
int foundMatch (char keywords_HTML[], char searchWords[])
{
    size_t i;

    for (i = 0; ; i++)
    {
	int a = tolower((unsigned char) keywords_HTML[i]);
	int b = tolower((unsigned char) searchWords[i]);

	if (a != b)
	    { return 0; }
	if (a == '\0')
	    { return 1; }		/* both words ended together */
    }
}


/* HTML utilities */
/*
   Split `line` at the first occurrence of `stop`.

   Returns a malloc'ed copy of the text before `stop` (the caller frees
   it).  `line` is rewritten in place to hold only the text after `stop`;
   if `stop` does not occur, the whole line is returned and `line`
   becomes empty.  Returns NULL, leaving `line` untouched, when memory
   cannot be allocated.
*/
char *makeword(char *line, char stop) {
  size_t x = 0, y = 0;
  char *word = malloc(strlen(line) + 1);

  if (word == NULL)
    return NULL;

  while (line[x] != '\0' && line[x] != stop) {
    word[x] = line[x];
    x++;
  }
  word[x] = '\0';

  if (line[x] != '\0')
    x++;                        /* step over the stop character */

  /* Slide the remainder, including its terminator, to the front. */
  while ((line[y++] = line[x++]) != '\0')
    ;

  return word;
}

/*
   Read one word from f.

   Characters are read until `stop` is seen (it is consumed but not
   stored), until end of file, or until *cl - the number of bytes still
   expected - has been counted down to zero.  *cl is decremented once
   per read attempt.

   Returns the word as a malloc'ed string (the caller frees it), or NULL
   when memory cannot be allocated.
*/
char *fmakeword(FILE *f, char stop, int *cl) {
  size_t wsize = 102400;
  size_t ll = 0;
  char *word = malloc(wsize + 1);
  int c;

  if (word == NULL)
    return NULL;

  for (;;) {
    c = fgetc(f);               /* keep the int so EOF stays distinguishable */
    --(*cl);
    if (c == EOF || (char)c == stop)
      break;

    if (ll == wsize) {          /* buffer full: grow before storing */
      char *grown = realloc(word, wsize + 102400 + 1);
      if (grown == NULL) {
        free(word);
        return NULL;
      }
      word = grown;
      wsize += 102400;
    }
    word[ll++] = (char)c;

    if (*cl == 0)               /* that was the last expected byte */
      break;
  }
  word[ll] = '\0';
  return word;
}

/*
   Decode URL escapes in place: every "%XX", where XX are two hex
   digits, is replaced by the character x2c() computes from them.

   A '%' that is not followed by two hex digits (for example at the very
   end of the string) is copied through unchanged, so the scan never
   steps over the terminating '\0'.
*/
void unescape_url(char *url) {
  size_t x = 0, y = 0;

  while (url[y] != '\0') {
    if (url[y] == '%' &&
        isxdigit((unsigned char)url[y+1]) &&
        isxdigit((unsigned char)url[y+2])) {
      url[x++] = x2c(&url[y+1]);
      y += 3;
    } else {
      url[x++] = url[y++];
    }
  }
  url[x] = '\0';
}

/*
   Convert the two hex digits what[0] and what[1] into the character
   they encode.  Letters are accepted in either case; the digits are not
   validated.
*/
char x2c(char *what) {
  const char hi = what[0];
  const char lo = what[1];
  char value;

  /* Clearing bit 5 (0xdf) folds a lower-case letter to upper case. */
  if (hi >= 'A')
    value = ((hi & 0xdf) - 'A') + 10;
  else
    value = hi - '0';

  value *= 16;

  if (lo >= 'A')
    value += ((lo & 0xdf) - 'A') + 10;
  else
    value += lo - '0';

  return value;
}

/* Replace every '+' in str with a space, in place (form encoding writes
   spaces as '+'). */
void plustospace(char *str) {
  char *cursor;

  for (cursor = str; *cursor != '\0'; ++cursor) {
    if (*cursor == '+') {
      *cursor = ' ';
    }
  }
}
