/**
 * HTML SPIDER
 * @author Robert John Morton
 * @version 03 Oct 1999 modified 13 July 2009 revamped 16 July 2009
 */

/* BUILDS A COMPACT INDEX (index.dat) FOR THE SEARCH ENGINE APPLET

   Extracts the keywords from the HTML keywords meta tag of every relevant
   HTML file in the website and builds a compact index file (index.dat) for
   use by my search-engine applets 'search.java' and 'search.c'. This program
   follows the directory structure of the website: it does not use or follow
   HTML links. If you do not want a file to be indexed, make sure it has no
   keywords meta tag. It looks for keywords strictly within files whose names
   end with ".html" and not ".htm".

   This program is automatically called by the index-website.sh script in the
   home directory and in '/home/rob/Private/computer/Bash/'. The script
   re-encapsulates the index.dat file, together with the full search engine
   applet 'search.java', into a JAR file, which it places in the top-level
   directory of the website. In fact, the script creates 2 versions: one for
   the embedded applet and another for the stand-alone WebStart application.

   In the keywords meta tag's content="" attribute, all individual keywords
   must be comma-delimited: no groups of words separated by only spaces.

   COMMAND: cd /home/rob/Private/website/webtools/search-engine
            java spider

   You have the option to enter the absolute path to the website directory as
   a command-line argument. Do NOT put a final slash. If no path is entered,
   the program uses the default website directory: /home/rob/Private/website

   You can enter a second argument: a sub-directory path below the website
   directory specified in the first argument. This is a way of limiting the
   indexing to only a specified sub-directory. The relative filespecs
   produced will, however, still be relative to the entered website directory
   (the first argument).

   This program generates the following files:

   filespec_text.dat     contains the relative filespecs of all HTML files in
                         the parent directory and all its subdirectories.

   filespecs.txt         contains the relative filespecs as in
                         "filespec_text.dat" but in text form for visual
                         checking.

   filespec_pointers.dat contains the offset of each relative filespec held
                         in "filespec_text.dat". An extra offset is included
                         at the end to make it easy to find the extent of the
                         last filespec. The maximum size of the filespec text
                         is determined by the 32-bit size of the offset
                         pointers.

   An attempt was made to compact the filespecs file by listing directories
   separately and replacing the filepaths by references to the appropriate
   directories in the separate directories file. However, this saved only 687
   bytes in the whole website index file. The extra code necessary in
   "search.java" to handle it would have required a lot more than this. This
   idea was therefore discarded.

   keywords.tmp          contains every occurrence of every keyword in the
                         website (converted entirely to lower case).

   keyword_pointers.tmp  comprises 8-byte records, each of which contains the
                         offset of a keyword within "keywords.tmp" (32 bits),
                         the extent of the keyword (16 bits) and the
                         reference number (16 bits) of the HTML file in whose
                         keywords meta tag the keyword occurred.

   no_keywords.txt       contains the filespecs of all relevant HTML files
                         that lack a keywords meta tag.

   no_descriptions.txt   contains the filespecs of all accepted HTML files
                         that lack a description. */
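/* For illustration only (the keyword values and sub-directory below are
   hypothetical examples, not taken from the website): a keywords meta tag
   that this spider will accept looks like

      <meta name="keywords" content="radio,antenna,propagation">

   and a run restricted to one sub-directory of the website would be invoked
   as

      java spider /home/rob/Private/website radio                          */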
import java.io.*; // for file input/output handling

class spider {

  private static int
    dir_level = 0, // current directory level used in scan()
    KeyWrdPtr,     // pointer to current keyword in keywords.tmp
    LenDir,        // length of parent directory path + a terminating '/'
    FilePtr = 0,   // offset pointer for the file filespec_text.dat
    NFS = 0xffff,  // max number of filespecs possible in the filespec index
    nfs = 0;       /* number of the current filespec within
                      filespec_text.dat, filespecs.txt */

  private static boolean // SET TO TRUE WHEN:
    Kwds = false,  // current HTML file has keywords
    Desc = false,  // current HTML file has a description
    abrt = false;  // run is aborted due to file pointer overflow

  private static String
    /* The names of all the top-level directories in the website that must be
       indexed. There are other top-level directories which are excluded from
       the indexing process for one reason or another. */
    FOLDERS[] = {"book","chaos","computers","home","internet","landshare",
                 "navigation","poems","radio","science","software"},

    // FILE NAMES
    F00 = "badKeyWords.txt",
    F01 = "number_of_keywords_per_HTML_file.txt",
    F02 = "filespec_text.dat",
    F03 = "filespecs.txt",
    F04 = "no_keywords.txt",
    F05 = "no_descriptions.txt",
    F06 = "filespec_pointers.dat",
    F07 = "keywords.tmp",
    F08 = "keyword_pointers.tmp",
    F09 = "keywords.txt",
    F10 = "keyword_text.dat",
    F11 = "keyword_pointers.dat",
    F12 = "filespec_ref_pointers.dat",
    F13 = "filespec_ref_numbers.dat",

    BaseDir = "/home/rob/Private/website", // the default base directory
    Tag, tag,         // text of current meta tag, plus lower-case version
    SubDir = "",      // sub-directory
    FullPath = "",    // absolute path to directory currently being indexed
    RelFileSpec = "", // relative filespec of current HTML file
    Keywords = "",    // current file's comma-delimited keywords
    // run of spaces used to pad keywords for display in keywords.txt
    // (assumed long enough for the longest keyword)
    padding = "                                                ";

  private static Writer
    bkwf, // "badKeyWords.txt": paths of HTML files containing bad keywords
    kwpf, // "number_of_keywords_per_HTML_file.txt"
    fsts, // "filespec_text.dat" which contains relative file specs
    fstx, // "filespecs.txt" containing a text version of the above for checking
    fsnk, // "no_keywords.txt": filespecs of HTML files lacking keywords
    fsnd, // "no_descriptions.txt": filespecs of HTML files lacking a description
    kwtm, // "keywords.tmp" containing the keywords
    kttx; // "keywords.txt" which contains all keywords in viewable form

  /* Declare the data output streams: one for "filespec_pointers.dat", which
     contains the index to "filespec_text.dat", and one for the keywords
     index file "keyword_pointers.tmp". */
  private static DataOutputStream fstp, kidx;

  private static RandomAccessFile
    skwf, // for sorting the keywords file keywords.tmp
    skif; // for sorting the keywords index file keyword_pointers.tmp

  /* The following character substitution method exists because, when an
     accented character is written to a file, it is converted from Java's
     internal Unicode to UTF-8. UTF-8 uses 2 bytes for accented characters,
     so the length of a string within Java can differ from the number of
     bytes put out in UTF-8. Consequently, I have decreed that all accents
     shall be removed for the purpose of keyword searching. This ensures that
     all characters occupy only one byte in UTF-8, so I can make the index
     about half the size it would have to be in Unicode. NOTE: simply zeroing
     the upper byte of the Java Unicode char does not work: it changes the
     actual letter. Called from one place in putkw(). */
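  /* A minimal illustration of the point above (not part of the program; the
     literals are hypothetical examples):

        "é".getBytes("UTF-8").length                   // 2 bytes in UTF-8
        removeAccents("é").getBytes("UTF-8").length    // 1 byte: "e"

     so stripping accents keeps every keyword character to one byte on disk. */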
  private static String removeAccents(String s) {
    s = s.replaceAll("[èéêë]","e");
    s = s.replaceAll("[ûùúü]","u");
    s = s.replaceAll("[ïîíì]","i");
    s = s.replaceAll("[àáâã]","a");
    s = s.replaceAll("[ôóòõ]","o");
    s = s.replaceAll("[ç]","c");
    return s;
  }

  // Called from 3 places in qs().
  private static String getkw(int n) throws IOException {
    /* Set the pointer to the start of the appropriate index record and read
       the record, discarding its 16-bit file-number field. */
    skif.seek((long)(n << 3));
    long o = skif.readLong() >>> 16;
    /* Create a byte array the length of the keyword and set the pointer to
       the start of the keyword. */
    byte B[] = new byte[(int)(o & 0xffff)];
    skwf.seek(o >>> 16);
    /* Read the keyword into the byte array and return the byte array as a
       string. */
    skwf.readFully(B);
    return new String(B);
  }

  // Called from 2 places in qs().
  private static long getki(int n) throws IOException {
    /* Set the pointer to the start of the appropriate index record and
       return the index record. */
    skif.seek((long)(n << 3));
    return skif.readLong();
  }

  // Called from 2 places in qs().
  private static void putki(int n, long l) throws IOException {
    /* Set the pointer to the start of the appropriate index record and save
       the index record. */
    skif.seek((long)(n << 3));
    skif.writeLong(l);
  }

  /* This method takes a single keyword (String w) and writes it as an
     un-delimited string of bytes to the file "keywords.tmp". It then writes
       1. the pointer to the start of this keyword (32-bit 'int'),
       2. its length (16-bit 'short') - the length of a keyword is assumed
          always to be less than 64k bytes! - and
       3. the number (16-bit 'short') of the HTML file (as listed in F02) in
          which it appears
     to the file "keyword_pointers.tmp" as a single 64-bit 'long'.
     Called from 2 places in putKeywords(). */
  private static void putkw(String w) throws IOException {
    /* Convert the keyword to lower case, remove accents and write it to the
       file "keywords.tmp". */
    String s = removeAccents(w.toLowerCase());
    kwtm.write(s);
    /* Write the offset, length and file number to "keyword_pointers.tmp",
       then advance the keyword pointer to the byte after the end of the
       keyword as written. */
    kidx.writeLong(((long)KeyWrdPtr << 32) + (s.length() << 16) + nfs);
    KeyWrdPtr += s.length();
  }

  /* CHECK THE VALIDITY OF A KEYWORDS OR DESCRIPTION META TAG.
     String 't' names the type of tag being checked for ("keywords" or
     "description"); the text of the whole tag is in the class variable
     'tag'. Called from 2 places in getMetaTags(). */
  private static boolean isMetaTag(boolean b, String t) {
    int p, q; // start and end pointers for chopping string 'tag'
    if (((p = tag.indexOf("meta")) == -1)          // if not a meta tag
     || ((p = tag.indexOf("name", p + 4)) == -1)   // or it has no name
     || ((p = tag.indexOf('=', p + 4)) == -1)      // or it has no name= sign
     // or it is not a tag of the required type: keywords or description
     || ((p = tag.indexOf(t, p + 1)) == -1)
     // or it has no "content" parameter (search beyond the tag-type word)
     || ((p = tag.indexOf("content", p + 9)) == -1)
     || ((p = tag.indexOf('=', p + 7)) == -1)      // or it has no content= sign
     || ((p = tag.indexOf('\"', p + 1)) == -1)     // or it has no opening quote
     || ((q = tag.indexOf('\"', p + 1)) == -1))    // or it has no closing quote
      return false; // it is not a valid keywords or description meta tag
    /* The boolean 'b' is true for a "keywords" tag and false for a
       "description" tag. So, if it is a "keywords" tag... */
    if(b) {
      Keywords = tag.substring(p + 1, q);    // extract the keywords
      if(Keywords.equals("")) return false;  // no keywords specified
    }
    return true; // it is a valid description or keywords tag
  }

  /* EXAMINE THE CONTENTS OF THE HTML FILE'S <head> SECTION.
     Called from only one place in scan(). */
  static void getMetaTags(File fo) {
    boolean             // TRUE WHEN:
      inTag = false,    // getting characters that are part of a tag name
      overshot = false; /* we get to the </head> tag without finding
                           everything we want */
    /* The following flags are respectively TRUE when the description has
       been successfully extracted from its meta tag and when the keywords
       have been successfully extracted from their meta tag. */
    Desc = false;
    Kwds = false;
    Keywords = ""; // clear the keywords of the previous HTML file
    int x;         // for receiving java character input from the file stream
    char c;        // for each character retrieved from the file input stream
    Tag = "";      // raw tag input string
    try { // create a file reader for this HTML file
      FileReader fr = new FileReader(fo);
      while((x = fr.read()) != -1) { // loop broken by End-Of-File
        /* Get the first/next character from the file stream. If it is an
           initial tag-delimiter '<', set the inTag flag to show that we are,
           from now on, inside an HTML tag. Then, in effect, we loop back to
           get the next character. */
        c = (char)x;
        if(c == '<') inTag = true;
        /* If the character we just pulled is a final tag-delimiter '>',
           rationalise the tag text to lower case ready for comparison. */
        else if(c == '>') {
          tag = Tag.toLowerCase();
          /* If the whole of the meta tag containing the HTML document's
             keywords is now complete and valid, set the "keywords received"
             flag. If the HTML document's description has already been
             received, break out of the while-loop. */
          if(isMetaTag(true, "keywords")) { Kwds = true; if(Desc) break; }
          /* If the whole of the meta tag containing the HTML document's
             description is now complete and valid, set the "description
             received" flag. If the HTML document's keywords have already
             been received, break out of the while-loop. */
          else if(isMetaTag(false, "description")) {
            Desc = true; if(Kwds) break;
          }
          /* If we've hit the </head> tag without successfully acquiring
             valid description and keywords tags, set the "overshot" flag to
             indicate this and break out of the while-loop. */
          else if(tag.equals("/head")) { overshot = true; break; }
          Tag = "";      // Clear and reset ready for
          inTag = false; // the next tag to be encountered.
        }
        else        // otherwise,
        if(inTag)   // if currently inside a tag,
          Tag += c; // simply add the current character to the tag name
      } // End of while()
      fr.close(); // close the file reader
    } catch(Exception e) { } // ignore file-access exceptions
  }

  /* This method takes a string 's' of comma-delimited keywords. It extracts
     and trims each keyword in turn from 's' and stores it in the file
     "keywords.tmp". Note that the last keyword has to be handled separately
     because it is not terminated by a comma like the others.
     Called from only one place in scan(). */
  private static int putKeywords(String s) throws IOException {
    int n = 0, // number of keywords in this tag
        x = 0, // pointer to the start of a single keyword
        y;     // pointer to the finish of a single keyword
    String t,  // for a single keyword
           Q;  // for text file line padding
    /* While we can still find a further comma, extract and trim the next
       individual keyword and store it in the file "keywords.tmp". Increment
       the number of retrieved keywords. NOTE: trim() removes all types of
       whitespace characters, including space, newline, tab, etc., so don't
       worry if there are odd line-breaks within the comma-delimited keywords
       list inside the keywords meta tag. */
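    /* For illustration only (the keyword string below is hypothetical): if
       s = "radio, antenna ,aerials" then the loop below stores "radio" and
       "antenna", and the code after the loop stores the final,
       un-comma-terminated keyword "aerials". */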
    while((y = s.indexOf(",", x)) != -1) {
      t = (s.substring(x, y)).trim();
      putkw(t);
      n++;
      /* If the trimmed keyword contains a space character, it is bad, so
         write its file path to F00. */
      if(t.indexOf(' ') != -1) bkwf.write(BaseDir + '/' + RelFileSpec + '\n');
      /* Calculate how much padding space is needed to display the keyword
         visually. Then pad out the keyword, attach the path + filename of
         the file in which it occurs and store this in "keywords.txt" so that
         it can be visually checked. Then advance 'x' to the character
         following the current comma. */
      Q = padding.substring(t.length());
      kttx.write(t + Q + BaseDir + '/' + RelFileSpec + '\n');
      x = y + 1;
    }
    /* Extract, trim and store the final individual keyword, which has no
       terminating comma. Pad out the keyword, attach the path + filename of
       the file in which it occurs and store this in "keywords.txt". Add the
       final keyword to the number of keywords in this HTML file. */
    t = (s.substring(x, s.length())).trim();
    putkw(t);
    Q = padding.substring(t.length());
    kttx.write(t + Q + BaseDir + '/' + RelFileSpec + '\n');
    n++;
    return n;
  }

  /* This method is re-entrant: it calls itself. When invoked, it lists the
     files and directories contained within the directory whose path is
     passed to it as its parameter. It then examines each entry in that
     directory. If an entry is a relevant HTML file, it writes that file's
     relative filespec to the file "filespec_text.dat". It then places the
     filespec's offset as an entry in the index file "filespec_pointers.dat",
     written as a 32-bit integer. The relative filespec is the path +
     filename from the point of view of the parent directory. If an entry is
     a directory, it simply calls itself to deal with that (sub)directory as
     it is doing with the current directory. Thus it can handle any depth of
     sub-directories below the parent. Only HTML files with keywords meta
     tags are placed in the index: the others are logged in the file
     "no_keywords.txt". HTML files lacking a description meta tag are logged
     in the file "no_descriptions.txt".

        /home/rob/Private/website/book/chap01/chap01/chap01_frame.htm
        |<------- LenDir ------->|<------- relative filespec -------->|

     Called only by itself and from extractKwds(). */
  private static void scan(String d) throws IOException {
    File fd = new File(d);  // create a file object for the given directory name
    String D[] = fd.list(); // list all files and sub-directories in this directory
    // for each file or sub-directory in the CURRENT directory
    for(int i = 0; i < D.length; i++) {
      String                // FOR THE [NEXT] FILE OR SUB-DIRECTORY, CREATE:
        dd = D[i],          // a string to contain its relative path name
        fp = d + "/" + dd;  // a string to contain its file-path
      File fo = new File(fp);  // create a file object for it
      if(fo.isDirectory()) {   // if the object is a directory
        boolean flag = false;  /* FALSE indicates that this is NOT one of the
                                  directories to be indexed. */
        /* If we are in the top-level directory of the website THEN, if the
           ith sub-directory is one of those to be indexed, set the flag to
           indicate that this sub-directory must be indexed; ELSE we cannot
           be in the top-level directory, so set the flag and go and index
           the sub-directory anyway. */
        if(dir_level == 0) {
          for(int j = 0; j < FOLDERS.length; j++)
            if(dd.equals(FOLDERS[j])) { flag = true; break; }
        } else
          flag = true;
        /* Provided this is one of the directories to be indexed and it is
           not an images, applets or java_progs directory, then... */
        if(flag && (dd.indexOf("images") == -1)
                && (dd.indexOf("applets") == -1)
                && (dd.indexOf("java_progs") == -1)) {
          dir_level++; // Increment the directory level from the one we're in,
          scan(fp);    // re-enter this method, then on return,
          dir_level--; // decrement the directory level back to this one.
        }
        if(abrt) return; // for re-entrant returns after the first abort
      }
      else                                  // Otherwise [it's not a directory],
      if(fo.isFile()                        // so if it's an existing file
         && fp.endsWith(".html")            // and it is an HTML file
         && (fp.indexOf("index") == -1)     // but not an index file
         && (fp.indexOf("noindex") == -1)   // or a noindex file
         && (fp.indexOf("contents") == -1)  // or a contents file
         && !fp.endsWith("error404.html")   // or an error notification file
         && !fp.endsWith("error404_br.html")
         && (dir_level > 0)                 // and it isn't at the top level,
        ) {
        getMetaTags(fo);         // extract the file's keywords and
                                 // description meta tags.
        if(Keywords.equals("")) // If this file has no keywords,
          fsnk.write(fp + "\n");// write the absolute rejected
                                // filespec to no_keywords.txt.
        /* Provided we have not so far overshot the capacity of the 16-bit
           pointer, get the file's relative filespec (relative path +
           filename), write it to filespec_text.dat and, as text, to
           filespecs.txt. Write the file's 32-bit offset pointer to
           filespec_pointers.dat. */
        else if(nfs < NFS) {
          RelFileSpec = fp.substring(LenDir);
          fsts.write(RelFileSpec);
          fstx.write(RelFileSpec + '\n');
          fstp.writeInt(FilePtr);
          /* Advance the file pointer to the byte after the last byte in the
             current filespec. Write the keywords to keywords.tmp,
             keyword_pointers.tmp and keywords.txt, then increment the
             filespec reference number ready for the next filespec. */
          FilePtr += RelFileSpec.length();
          int numkws = putKeywords(Keywords);
          nfs++;
          /* Pad the number of keywords with leading zeros and write the
             number of keywords plus the filespec to
             "number_of_keywords_per_HTML_file.txt". */
          String Numkws = "" + numkws;
          if(numkws > 999) Numkws = "XXX";
          else if(numkws < 10) Numkws = "00" + Numkws;
          else if(numkws < 100) Numkws = "0" + Numkws;
          kwpf.write(Numkws + " " + fp + "\n");
          /* If this HTML file doesn't have a description meta tag, write its
             filespec to "no_descriptions.txt". */
          if(!Desc) fsnd.write(fp + "\n");
        } else {
          abrt = true; // abort because there are more than 64k files
          break;       // and we only have a 16-bit pointer.
        }
      }
    } // end of for() loop
  }

  /* The following method embodies C A R Hoare's Quick Sort algorithm.
     However, instead of sorting the elements of an array, it sorts the
     records of "keyword_pointers.tmp". These are offset-extent (plus
     file-number) records which point to the actual keywords being sorted.
     The keywords themselves stay where they are: only the pointers are
     sorted. Note that it is a highly re-entrant method: it calls itself
     repeatedly. Called only by itself and from one place in sortKwPtrs(). */
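  /* A worked micro-example (hypothetical keywords): if records 0, 1 and 2 of
     keyword_pointers.tmp point at the keywords "zebra", "apple" and "mango"
     in keywords.tmp, then after qs(0,2) the records are reordered so that
     they point at "apple", "mango" and "zebra" in that order, while
     keywords.tmp itself is left untouched. */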
  static void qs(int LO, int HI) throws IOException {
    int lo = LO, // set moving lo to the LO end of the partition
        hi = HI; // set moving hi to the HI end of the partition
    if (HI <= LO) return; // exit if the partition contains nothing
    // get the keyword pointed to by the mid element of the partition
    String mid = getkw((LO + HI) >> 1);
    while(lo <= hi) { // loop through the partition until its indices cross
      /* While the current lowest keyword < the midway keyword, push the
         lower sort boundary up by one element. While the current highest
         keyword > the midway keyword, pull the upper sort boundary down by
         one element. */
      while(lo < HI && getkw(lo).compareTo(mid) < 0) lo++;
      while(hi > LO && getkw(hi).compareTo(mid) > 0) hi--;
      if(lo <= hi) { // IF LOW INDEX <= HIGH INDEX, SWAP THEIR 'CONTENTS'
        /* Get the index record (offset-extent-filenum) of the lo element,
           put the index record of the hi element in the lo element, then put
           the saved index record of the lo element in the hi element. */
        long x = getki(lo);
        putki(lo, getki(hi));
        putki(hi, x);
        lo++; // push the lower sort boundary up by one element and
        hi--; // pull the upper sort boundary down by one element
      }
    }
    if(LO < hi)   // If 'hi' has not yet reached the start of the partition,
      qs(LO, hi); // sort the lower partition; and
    if(lo < HI)   // if 'lo' has not yet reached the end of the partition,
      qs(lo, HI); // sort the upper partition.
  }

  // METHODS CALLED ONLY BY main() --------------------------------------------

  /* OPENS ALL FILES FOR LISTING THE FILESPECS OF ALL THE FILES IN THE WEBSITE
     AND THEN EXTRACTING THE KEYWORDS FROM THEM.
     Called only from one place in main(). */
  private static void extractKwds() throws IOException {
    bkwf = new FileWriter(F00); // Create and open all the files
    kwpf = new FileWriter(F01);
    fsts = new FileWriter(F02);
    fstx = new FileWriter(F03);
    fsnk = new FileWriter(F04);
    fsnd = new FileWriter(F05);
    fstp = new DataOutputStream(new FileOutputStream(F06));
    kwtm = new FileWriter(F07);
    kidx = new DataOutputStream(new FileOutputStream(F08));
    kttx = new FileWriter(F09);
    scan(FullPath); // Create the index and write the data into the files
    if(!abrt)                 // Write the final pointer (points to
      fstp.writeInt(FilePtr); // the byte beyond the end of the file).
    fsnd.close(); // Close all the files
    fsnk.close();
    fsts.close();
    fstx.close();
    fstp.close();
    kidx.close();
    kwtm.close();
    kttx.close();
    kwpf.close();
    bkwf.close();
  }

  /* Keyword File Sorter: an adaptation of C A R Hoare's Quick Sort
     algorithm. It handles files that are already sorted and files with
     duplicate keywords. Note that it only sorts the records of the keyword
     index file "keyword_pointers.tmp": it does NOT sort the actual keywords
     in "keywords.tmp". Called from only one place in main(). */
  private static void sortKwPtrs() throws IOException {
    /* Open "keyword_pointers.tmp" for random read and write and
       "keywords.tmp" for read-only. Then sort the pointers into alphabetical
       order of their keywords and close both files. */
    skif = new RandomAccessFile(F08, "rw");
    skwf = new RandomAccessFile(F07, "r");
    qs(0, (int)((skif.length() >> 3) - 1));
    skwf.close();
    skif.close();
  }
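  /* Each 8-byte record written by putkw() packs three fields into one long:
     bits 63-32 hold the keyword's offset in keywords.tmp, bits 31-16 its
     length and bits 15-0 the HTML file's reference number. A minimal sketch
     (not part of the program) of how such a record 'rec' would be decoded,
     mirroring what getkw() above and makeKwIdxs() below actually do:

        int fileNum = (int)(rec & 0xffff);          // HTML file number
        int length  = (int)((rec >>> 16) & 0xffff); // keyword length in bytes
        long offset = rec >>> 32;                   // offset in keywords.tmp
  */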
  /* HTML KEYWORD FILES BUILDER

     keyword_pointers.tmp   ooooeeff
        oooo = 4-byte (32-bit) offset of the keyword within "keywords.tmp"
        ee   = 2-byte (16-bit) length of the keyword within "keywords.tmp"
        ff   = 2-byte (16-bit) reference number (its position in the filespec
               list indexed by F06) of the HTML file in whose meta tag this
               occurrence of this keyword was found.

     keywords.tmp contains variable-length keywords whose offsets and lengths
     are given in "keyword_pointers.tmp". Note that a given word may appear
     many times in "keywords.tmp" because an entry is made for each
     occurrence of each keyword within the meta tags of all HTML files in the
     web site. Note also that the keywords in "keywords.tmp" are NOT in
     alphabetical order; however, the pointers "oooo" in
     "keyword_pointers.tmp" point to them in alphabetical order.

     This method now compacts the above information into another set of
     files as follows:

     KEYWORD TEXT POINTERS
     keyword_pointers.dat   pppp
        pppp = 4-byte (32-bit) offset of a keyword in "keyword_text.dat"

     KEYWORD TEXT STREAM
     keyword_text.dat contains variable-length keywords whose offsets are
     held in "keyword_pointers.dat". Note that in "keyword_text.dat", each
     keyword appears once only. It is therefore considerably shorter than
     "keywords.tmp".

     RELATIVE FILESPEC POINTERS to REFERENCE NUMBERS
     filespec_ref_pointers.dat   qqqq
        qqqq = 4-byte (32-bit) offset of the reference number of the HTML
               file within which this keyword first occurred.

     RELATIVE FILESPEC REFERENCE NUMBERS
     filespec_ref_numbers.dat   ff
        ff = 2-byte (16-bit) file number of an HTML file whose meta tag
             contains a particular keyword. There is an entry in this file
             for every occurrence of every keyword.

     Called from only one place in main(). */
  private static void makeKwIdxs() throws IOException {
    DataInputStream kit = new DataInputStream(new FileInputStream(F08));
    RandomAccessFile kwt = new RandomAccessFile(F07, "r");
    OutputStream kwts = new FileOutputStream(F10);
    DataOutputStream
      kwtp = new DataOutputStream(new FileOutputStream(F11)),
      fsrp = new DataOutputStream(new FileOutputStream(F12)),
      fsrn = new DataOutputStream(new FileOutputStream(F13));
    File f = new File(F08),
         g = new File(F07);
    int            // POINTER TO:
      p = 0,       // the current keyword in "keyword_text.dat"
      q = 0,       // the current file number in "filespec_ref_numbers.dat"
      N = (int)f.length() >> 3; // number of 8-byte records in
                                // "keyword_pointers.tmp"
    String w = ""; // holder for the current keyword
    /* For each record in "keyword_pointers.tmp", get its 8-byte content and
       write the HTML file's reference number to "filespec_ref_numbers.dat". */
    for(int n = 0; n < N; n++) {
      long o = kit.readLong();
      fsrn.writeShort((short)(o & 0xffff));
      int e = (int)((o >>>= 16) & 0xffff); // extract the length of the keyword
      byte B[] = new byte[e];   // create a new byte buffer to receive it
      kwt.seek(o >>> 16);       // move to the start of the keyword in "keywords.tmp"
      kwt.readFully(B);         // read the keyword into the byte buffer
      String s = new String(B); // copy it into a new string, s
      if(!w.equals(s)) {  // If it's a new keyword, then
        kwts.write(B);    // write it to "keyword_text.dat", its
        kwtp.writeInt(p); // pointer to "keyword_pointers.dat" and its
        fsrp.writeInt(q); // file-number offset to "filespec_ref_pointers.dat".
        p += e;           // Shift the pointer to the byte beyond this keyword in
        w = s;            // "keyword_text.dat" and make it the current keyword.
      }
      q += 2; // advance the file-number offset for "filespec_ref_pointers.dat"
    }
    /* Note that these pointers point to the byte beyond the last keyword and
       the byte beyond the last file number in "keyword_text.dat" and
       "filespec_ref_numbers.dat" respectively. */
    kwtp.writeInt(p); // write the final end-pointer to keyword_pointers.dat
    fsrp.writeInt(q); // write the final end-pointer to filespec_ref_pointers.dat
    kwtp.close(); kwts.close(); fsrp.close();
    fsrn.close(); kwt.close(); kit.close();
    f.delete(); // delete the temporary files
    g.delete();
  }

  /* COMPACT ALL THE FILES INTO A SINGLE index.dat FILE.
     The search engine index file set comprises:
       1. keyword_pointers.dat
       2. keyword_text.dat
       3. filespec_ref_pointers.dat
       4. filespec_ref_numbers.dat
       5. filespec_pointers.dat
       6. filespec_text.dat (F02), the filespec text stream file
     This method joins the above into a single file, index.dat. Its full path
     is ../../index.dat to place it in the website's home directory. */
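  /* Resulting layout of index.dat, as built below (sketch: a, b, c, d, e and
     f are the byte lengths of files 1 to 6 above, and the five header ints
     hold the cumulative offsets a, a+b, a+b+c, a+b+c+d and a+b+c+d+e):

        [5 x 32-bit offsets][keyword_pointers][keyword_text]
        [filespec_ref_pointers][filespec_ref_numbers]
        [filespec_pointers][filespec text]

     The extent of the final section is implied by the end of the file. */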
  private static void concat() throws IOException {
    int x;
    File f1 = new File(F11);
    File f2 = new File(F10);
    File f3 = new File(F12);
    File f4 = new File(F13);
    File f5 = new File(F06);
    File f6 = new File(F02);
    DataInputStream kwtp = new DataInputStream(new FileInputStream(f1));
    DataInputStream kwts = new DataInputStream(new FileInputStream(f2));
    DataInputStream fsrp = new DataInputStream(new FileInputStream(f3));
    DataInputStream fsrn = new DataInputStream(new FileInputStream(f4));
    DataInputStream fstp = new DataInputStream(new FileInputStream(f5));
    DataInputStream fsts = new DataInputStream(new FileInputStream(f6));
    DataOutputStream indx
      = new DataOutputStream(new FileOutputStream("index.dat"));
    /* Write the cumulative extents (in bytes) of: the keyword pointers, plus
       the keywords text, plus the pointers to the filespec reference
       numbers, plus the filespec reference numbers, plus the filespec
       pointers. The extent of the filespecs text is implied by the end of
       the file. */
    int a = (int)f1.length(); indx.writeInt(x = a);
    int b = (int)f2.length(); indx.writeInt(x += b);
    int c = (int)f3.length(); indx.writeInt(x += c);
    int d = (int)f4.length(); indx.writeInt(x += d);
    int e = (int)f5.length(); indx.writeInt(x += e);
    int f = (int)f6.length();
    byte B[]; // copy each file's bytes into index.dat
    B = new byte[a]; kwtp.readFully(B); indx.write(B, 0, a);
    B = new byte[b]; kwts.readFully(B); indx.write(B, 0, b);
    B = new byte[c]; fsrp.readFully(B); indx.write(B, 0, c);
    B = new byte[d]; fsrn.readFully(B); indx.write(B, 0, d);
    B = new byte[e]; fstp.readFully(B); indx.write(B, 0, e);
    B = new byte[f]; fsts.readFully(B); indx.write(B, 0, f);
    // CLOSE ALL THE FILES
    indx.close(); fsts.close(); fstp.close(); fsrn.close();
    fsrp.close(); kwts.close(); kwtp.close();
    // DELETE ALL EXCEPT index.dat
    f1.delete(); f2.delete(); f3.delete();
    f4.delete(); f5.delete(); f6.delete();
  }

  public static void main(String args[]) throws IOException {
    int L = args.length; // number of command-line arguments
    /* Provided a command-line argument has been entered, get the name of the
       website's base directory from the command line. If there is a second
       command-line argument, it is the name of a sub-directory of the site
       which is appended to the search path. */
    if(L > 0) {
      BaseDir = args[0];
      if(L > 1) SubDir = args[1];
    }
    /* If a search sub-directory has been specified, form the full path;
       otherwise assume indexing starts at the base directory level. */
    if(!SubDir.equals("")) FullPath = BaseDir + "/" + SubDir;
    else FullPath = BaseDir;
    /* Relative filespecs stay relative to the base directory (the first
       argument), so strip only the base directory path + its trailing '/'. */
    LenDir = BaseDir.length() + 1;
    // Form a file object for the directory to be searched.
    File pd = new File(FullPath);
    // Provided that the command-line argument is an existing directory...
    if(pd.isDirectory()) {
      extractKwds(); // list the filespecs and extract the keywords
      if(abrt) {
        System.out.println("More than 64k HTML files.");
        System.out.println("Modify spider.java for 32-bit");
        System.out.println("filespec text pointers.");
      } else {
        sortKwPtrs(); // sort the keyword pointers,
        makeKwIdxs(); // generate the keyword index files and
        concat();     // concatenate all the index components
      }               // into a single index.dat file.
    } else
      System.out.println(FullPath + " is not a directory.");
  }
}