/**
 * HTML SPIDER
 * @author Robert John Morton
 * @version 03 Oct 1999 modified 13 July 2009 revamped 16 July 2009
 */

/* BUILDS A COMPACT INDEX (index.dat) FOR THE SEARCH ENGINE APPLET

   Extracts the keywords from the HTML keywords meta tag of every relevant
   HTML file in the website and builds a compact index file (index.dat) for
   use by my search-engine applets 'search.java' and 'search.c'. This program
   follows the directory structure of the website: it does not use or follow
   HTML links. If you do not want a file to be indexed, make sure it has no
   keywords meta tag. It looks for keywords strictly within files whose names
   end with ".html" and not ".htm".

   This program is automatically called by the index-website.sh script in the
   home directory and in '/home/rob/Private/computer/Bash/'. The script
   re-encapsulates the index.dat file, together with the full search engine
   applet 'search.java', into a JAR file, which it places in the top-level
   directory of the website. In fact, the script creates 2 versions: one for
   the embedded applet and another for the stand-alone WebStart application.

   In the keywords meta tag's content="" attribute, all individual keywords
   must be comma-delimited: no groups of words separated by only spaces.

   COMMAND: cd /home/rob/Private/website/webtools/search-engine
            java spider

   You have the option to enter the absolute path to the website directory as
   a command-line argument. Do NOT put a final slash. If no path is entered,
   the program uses the default website directory: /home/rob/Private/website

   You can enter a second argument: a sub-directory path below the website
   directory specified in the first argument. This is a way of limiting the
   indexing to only a specified sub-directory. The relative filespecs
   produced will, however, still be relative to the entered website directory
   (the first argument).

   This program generates the following files:

   filespec_text.dat     contains the relative filespecs of all HTML files in
                         the parent directory and all its subdirectories.

   filespecs.txt         contains the relative filespecs as in
                         "filespec_text.dat" but in text form for visual
                         checking.

   filespec_pointers.dat contains the offset of each relative filespec held
                         in "filespec_text.dat". An extra offset is included
                         at the end to make it easy to find the extent of the
                         last filespec. The maximum size of the filespec text
                         is determined by the 32-bit size of the offset
                         pointers.

   An attempt was made to compact the filespecs file by listing directories
   separately and replacing the filepaths by references to the appropriate
   directories in the separate directories file. However, this saved only 687
   bytes in the whole website index file. The extra code necessary in
   "search.java" to handle it would have required a lot more than this. This
   idea was therefore discarded.

   keywords.tmp          contains every occurrence of every keyword in the
                         website (converted entirely to lower case).

   keyword_pointers.tmp  comprises 8-byte records, each of which contains the
                         offset of a keyword within "keywords.tmp" (32 bits),
                         the extent of the keyword (16 bits) and the
                         reference number (16 bits) of the HTML file in whose
                         keywords meta tag the keyword occurred.

   no_keywords.txt       contains the filespecs of all relevant HTML files
                         that lack a keywords meta tag.

   no_descriptions.txt   contains the filespecs of all accepted HTML files
                         that lack a description. */
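/* For illustration only (the keyword values and sub-directory below are
   hypothetical examples, not taken from the website): a keywords meta tag
   that this spider will accept looks like

      <meta name="keywords" content="radio,antenna,propagation">

   and a run restricted to one sub-directory of the website would be invoked
   as

      java spider /home/rob/Private/website radio                          */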
import java.io.*; // for file input/output handling

class spider {

  private static int
    dir_level = 0, // current directory level used in scan()
    KeyWrdPtr,     // pointer to current keyword in keywords.tmp
    LenDir,        // length of parent directory path + a terminating '/'
    FilePtr = 0,   // offset pointer for the file filespec_text.dat
    NFS = 0xffff,  // max number of filespecs possible in the filespec index
    nfs = 0;       /* number of the current filespec within
                      filespec_text.dat, filespecs.txt */

  private static boolean // SET TO TRUE WHEN:
    Kwds = false,  // current HTML file has keywords
    Desc = false,  // current HTML file has a description
    abrt = false;  // run is aborted due to file pointer overflow

  private static String
    /* The names of all the top-level directories in the website that must be
       indexed. There are other top-level directories which are excluded from
       the indexing process for one reason or another. */
    FOLDERS[] = {"book","chaos","computers","home","internet","landshare",
                 "navigation","poems","radio","science","software"},

    // FILE NAMES
    F00 = "badKeyWords.txt",
    F01 = "number_of_keywords_per_HTML_file.txt",
    F02 = "filespec_text.dat",
    F03 = "filespecs.txt",
    F04 = "no_keywords.txt",
    F05 = "no_descriptions.txt",
    F06 = "filespec_pointers.dat",
    F07 = "keywords.tmp",
    F08 = "keyword_pointers.tmp",
    F09 = "keywords.txt",
    F10 = "keyword_text.dat",
    F11 = "keyword_pointers.dat",
    F12 = "filespec_ref_pointers.dat",
    F13 = "filespec_ref_numbers.dat",

    BaseDir = "/home/rob/Private/website", // the default base directory
    Tag, tag,         // text of current meta tag, plus lower-case version
    SubDir = "",      // sub-directory
    FullPath = "",    // absolute path to directory currently being indexed
    RelFileSpec = "", // relative filespec of current HTML file
    Keywords = "",    // current file's comma-delimited keywords
    // run of spaces used to pad keywords for display in keywords.txt
    // (assumed long enough for the longest keyword)
    padding = "                                                ";

  private static Writer
    bkwf, // "badKeyWords.txt": paths of HTML files containing bad keywords
    kwpf, // "number_of_keywords_per_HTML_file.txt"
    fsts, // "filespec_text.dat" which contains relative file specs
    fstx, // "filespecs.txt" containing a text version of the above for checking
    fsnk, // "no_keywords.txt": filespecs of HTML files lacking keywords
    fsnd, // "no_descriptions.txt": filespecs of HTML files lacking a description
    kwtm, // "keywords.tmp" containing the keywords
    kttx; // "keywords.txt" which contains all keywords in viewable form

  /* Declare the data output streams: one for "filespec_pointers.dat", which
     contains the index to "filespec_text.dat", and one for the keywords
     index file "keyword_pointers.tmp". */
  private static DataOutputStream fstp, kidx;

  private static RandomAccessFile
    skwf, // for sorting the keywords file keywords.tmp
    skif; // for sorting the keywords index file keyword_pointers.tmp

  /* The following character substitution method exists because, when an
     accented character is written to a file, it is converted from Java's
     internal Unicode to UTF-8. UTF-8 uses 2 bytes for accented characters,
     so the length of a string within Java can differ from the number of
     bytes put out in UTF-8. Consequently, I have decreed that all accents
     shall be removed for the purpose of keyword searching. This ensures that
     all characters occupy only one byte in UTF-8, so I can make the index
     about half the size it would have to be in Unicode. NOTE: simply zeroing
     the upper byte of the Java Unicode char does not work: it changes the
     actual letter. Called from one place in putkw(). */
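  /* A minimal illustration of the point above (not part of the program; the
     literals are hypothetical examples):

        "é".getBytes("UTF-8").length                   // 2 bytes in UTF-8
        removeAccents("é").getBytes("UTF-8").length    // 1 byte: "e"

     so stripping accents keeps every keyword character to one byte on disk. */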
  private static String removeAccents(String s) {
    s = s.replaceAll("[èéêë]","e");
    s = s.replaceAll("[ûùúü]","u");
    s = s.replaceAll("[ïîíì]","i");
    s = s.replaceAll("[àáâã]","a");
    s = s.replaceAll("[ôóòõ]","o");
    s = s.replaceAll("[ç]","c");
    return s;
  }

  // Called from 3 places in qs().
  private static String getkw(int n) throws IOException {
    /* Set the pointer to the start of the appropriate index record and read
       the record, discarding its 16-bit file-number field. */
    skif.seek((long)(n << 3));
    long o = skif.readLong() >>> 16;
    /* Create a byte array the length of the keyword and set the pointer to
       the start of the keyword. */
    byte B[] = new byte[(int)(o & 0xffff)];
    skwf.seek(o >>> 16);
    /* Read the keyword into the byte array and return the byte array as a
       string. */
    skwf.readFully(B);
    return new String(B);
  }

  // Called from 2 places in qs().
  private static long getki(int n) throws IOException {
    /* Set the pointer to the start of the appropriate index record and
       return the index record. */
    skif.seek((long)(n << 3));
    return skif.readLong();
  }

  // Called from 2 places in qs().
  private static void putki(int n, long l) throws IOException {
    /* Set the pointer to the start of the appropriate index record and save
       the index record. */
    skif.seek((long)(n << 3));
    skif.writeLong(l);
  }

  /* This method takes a single keyword (String w) and writes it as an
     un-delimited string of bytes to the file "keywords.tmp". It then writes
       1. the pointer to the start of this keyword (32-bit 'int'),
       2. its length (16-bit 'short') - the length of a keyword is assumed
          always to be less than 64k bytes! - and
       3. the number (16-bit 'short') of the HTML file (as listed in F02) in
          which it appears
     to the file "keyword_pointers.tmp" as a single 64-bit 'long'.
     Called from 2 places in putKeywords(). */
  private static void putkw(String w) throws IOException {
    /* Convert the keyword to lower case, remove accents and write it to the
       file "keywords.tmp". */
    String s = removeAccents(w.toLowerCase());
    kwtm.write(s);
    /* Write the offset, length and file number to "keyword_pointers.tmp",
       then advance the keyword pointer to the byte after the end of the
       keyword as written. */
    kidx.writeLong(((long)KeyWrdPtr << 32) + (s.length() << 16) + nfs);
    KeyWrdPtr += s.length();
  }

  /* CHECK THE VALIDITY OF A KEYWORDS OR DESCRIPTION META TAG.
     String 't' names the type of tag being checked for ("keywords" or
     "description"); the text of the whole tag is in the class variable
     'tag'. Called from 2 places in getMetaTags(). */
  private static boolean isMetaTag(boolean b, String t) {
    int p, q; // start and end pointers for chopping string 'tag'
    if (((p = tag.indexOf("meta")) == -1)          // if not a meta tag
     || ((p = tag.indexOf("name", p + 4)) == -1)   // or it has no name
     || ((p = tag.indexOf('=', p + 4)) == -1)      // or it has no name= sign
     // or it is not a tag of the required type: keywords or description
     || ((p = tag.indexOf(t, p + 1)) == -1)
     // or it has no "content" parameter (search beyond the tag-type word)
     || ((p = tag.indexOf("content", p + 9)) == -1)
     || ((p = tag.indexOf('=', p + 7)) == -1)      // or it has no content= sign
     || ((p = tag.indexOf('\"', p + 1)) == -1)     // or it has no opening quote
     || ((q = tag.indexOf('\"', p + 1)) == -1))    // or it has no closing quote
      return false; // it is not a valid keywords or description meta tag
    /* The boolean 'b' is true for a "keywords" tag and false for a
       "description" tag. So, if it is a "keywords" tag... */
    if(b) {
      Keywords = tag.substring(p + 1, q);    // extract the keywords
      if(Keywords.equals("")) return false;  // no keywords specified
    }
    return true; // it is a valid description or keywords tag
  }

  /* EXAMINE THE CONTENTS OF THE HTML FILE'S <head> SECTION.
     Called from only one place in scan(). */
  static void getMetaTags(File fo) {
    boolean             // TRUE WHEN:
      inTag = false,    // getting characters that are part of a tag name
      overshot = false; /* we get to the </head> tag without finding
                           everything we want */
    /* The following flags are respectively TRUE when the description has
       been successfully extracted from its meta tag and when the keywords
       have been successfully extracted from their meta tag. */
    Desc = false;
    Kwds = false;
    Keywords = ""; // clear the keywords of the previous HTML file
    int x;         // for receiving java character input from the file stream
    char c;        // for each character retrieved from the file input stream
    Tag = "";      // raw tag input string
    try { // create a file reader for this HTML file
      FileReader fr = new FileReader(fo);
      while((x = fr.read()) != -1) { // loop broken by End-Of-File
        /* Get the first/next character from the file stream. If it is an
           initial tag-delimiter '<', set the inTag flag to show that we are,
           from now on, inside an HTML tag. Then, in effect, we loop back to
           get the next character. */
        c = (char)x;
        if(c == '<') inTag = true;
        /* If the character we just pulled is a final tag-delimiter '>',
           rationalise the tag text to lower case ready for comparison. */
        else if(c == '>') {
          tag = Tag.toLowerCase();
          /* If the whole of the meta tag containing the HTML document's
             keywords is now complete and valid, set the "keywords received"
             flag. If the HTML document's description has already been
             received, break out of the while-loop. */
          if(isMetaTag(true, "keywords")) { Kwds = true; if(Desc) break; }
          /* If the whole of the meta tag containing the HTML document's
             description is now complete and valid, set the "description
             received" flag. If the HTML document's keywords have already
             been received, break out of the while-loop. */
          else if(isMetaTag(false, "description")) {
            Desc = true; if(Kwds) break;
          }
          /* If we've hit the </head> tag without successfully acquiring
             valid description and keywords tags, set the "overshot" flag to
             indicate this and break out of the while-loop. */
          else if(tag.equals("/head")) { overshot = true; break; }
          Tag = "";      // Clear and reset ready for
          inTag = false; // the next tag to be encountered.
        }
        else        // otherwise,
        if(inTag)   // if currently inside a tag,
          Tag += c; // simply add the current character to the tag name
      } // End of while()
      fr.close(); // close the file reader
    } catch(Exception e) { } // ignore file-access exceptions
  }

  /* This method takes a string 's' of comma-delimited keywords. It extracts
     and trims each keyword in turn from 's' and stores it in the file
     "keywords.tmp". Note that the last keyword has to be handled separately
     because it is not terminated by a comma like the others.
     Called from only one place in scan(). */
  private static int putKeywords(String s) throws IOException {
    int n = 0, // number of keywords in this tag
        x = 0, // pointer to the start of a single keyword
        y;     // pointer to the finish of a single keyword
    String t,  // for a single keyword
           Q;  // for text file line padding
    /* While we can still find a further comma, extract and trim the next
       individual keyword and store it in the file "keywords.tmp". Increment
       the number of retrieved keywords. NOTE: trim() removes all types of
       whitespace characters, including space, newline, tab, etc., so don't
       worry if there are odd line-breaks within the comma-delimited keywords
       list inside the keywords meta tag. */
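    /* For illustration only (the keyword string below is hypothetical): if
       s = "radio, antenna ,aerials" then the loop below stores "radio" and
       "antenna", and the code after the loop stores the final,
       un-comma-terminated keyword "aerials". */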
    while((y = s.indexOf(",", x)) != -1) {
      t = (s.substring(x, y)).trim();
      putkw(t);
      n++;
      /* If the trimmed keyword contains a space character, it is bad, so
         write its file path to F00. */
      if(t.indexOf(' ') != -1) bkwf.write(BaseDir + '/' + RelFileSpec + '\n');
      /* Calculate how much padding space is needed to display the keyword
         visually. Then pad out the keyword, attach the path + filename of
         the file in which it occurs and store this in "keywords.txt" so that
         it can be visually checked. Then advance 'x' to the character
         following the current comma. */
      Q = padding.substring(t.length());
      kttx.write(t + Q + BaseDir + '/' + RelFileSpec + '\n');
      x = y + 1;
    }
    /* Extract, trim and store the final individual keyword, which has no
       terminating comma. Pad out the keyword, attach the path + filename of
       the file in which it occurs and store this in "keywords.txt". Add the
       final keyword to the number of keywords in this HTML file. */
    t = (s.substring(x, s.length())).trim();
    putkw(t);
    Q = padding.substring(t.length());
    kttx.write(t + Q + BaseDir + '/' + RelFileSpec + '\n');
    n++;
    return n;
  }

  /* This method is re-entrant: it calls itself. When invoked, it lists the
     files and directories contained within the directory whose path is
     passed to it as its parameter. It then examines each entry in that
     directory. If an entry is a relevant HTML file, it writes that file's
     relative filespec to the file "filespec_text.dat". It then places the
     filespec's offset as an entry in the index file "filespec_pointers.dat",
     written as a 32-bit integer. The relative filespec is the path +
     filename from the point of view of the parent directory. If an entry is
     a directory, it simply calls itself to deal with that (sub)directory as
     it is doing with the current directory. Thus it can handle any depth of
     sub-directories below the parent. Only HTML files with keywords meta
     tags are placed in the index: the others are logged in the file
     "no_keywords.txt". HTML files lacking a description meta tag are logged
     in the file "no_descriptions.txt".

        /home/rob/Private/website/book/chap01/chap01/chap01_frame.htm
        |<------- LenDir ------->|<------- relative filespec -------->|

     Called only by itself and from extractKwds(). */
  private static void scan(String d) throws IOException {
    File fd = new File(d);  // create a file object for the given directory name
    String D[] = fd.list(); // list all files and sub-directories in this directory
    // for each file or sub-directory in the CURRENT directory
    for(int i = 0; i < D.length; i++) {
      String                // FOR THE [NEXT] FILE OR SUB-DIRECTORY, CREATE:
        dd = D[i],          // a string to contain its relative path name
        fp = d + "/" + dd;  // a string to contain its file-path
      File fo = new File(fp);  // create a file object for it
      if(fo.isDirectory()) {   // if the object is a directory
        boolean flag = false;  /* FALSE indicates that this is NOT one of the
                                  directories to be indexed. */
        /* If we are in the top-level directory of the website THEN, if the
           ith sub-directory is one of those to be indexed, set the flag to
           indicate that this sub-directory must be indexed; ELSE we cannot
           be in the top-level directory, so set the flag and go and index
           the sub-directory anyway. */
        if(dir_level == 0) {
          for(int j = 0; j < FOLDERS.length; j++)
            if(dd.equals(FOLDERS[j])) { flag = true; break; }
        } else
          flag = true;
        /* Provided this is one of the directories to be indexed and it is
           not an images, applets or java_progs directory, then... */
        if(flag && (dd.indexOf("images") == -1)
                && (dd.indexOf("applets") == -1)
                && (dd.indexOf("java_progs") == -1)) {
          dir_level++; // Increment the directory level from the one we're in,
          scan(fp);    // re-enter this method, then on return,
          dir_level--; // decrement the directory level back to this one.
        }
        if(abrt) return; // for re-entrant returns after the first abort
      }
      else                                  // Otherwise [it's not a directory],
      if(fo.isFile()                        // so if it's an existing file
         && fp.endsWith(".html")            // and it is an HTML file
         && (fp.indexOf("index") == -1)     // but not an index file
         && (fp.indexOf("noindex") == -1)   // or a noindex file
         && (fp.indexOf("contents") == -1)  // or a contents file
         && !fp.endsWith("error404.html")   // or an error notification file
         && !fp.endsWith("error404_br.html")
         && (dir_level > 0)                 // and it isn't at the top level,
        ) {
        getMetaTags(fo);         // extract the file's keywords and
                                 // description meta tags.
        if(Keywords.equals("")) // If this file has no keywords,
          fsnk.write(fp + "\n");// write the absolute rejected
                                // filespec to no_keywords.txt.
        /* Provided we have not so far overshot the capacity of the 16-bit
           pointer, get the file's relative filespec (relative path +
           filename), write it to filespec_text.dat and, as text, to
           filespecs.txt. Write the file's 32-bit offset pointer to
           filespec_pointers.dat. */
        else if(nfs < NFS) {
          RelFileSpec = fp.substring(LenDir);
          fsts.write(RelFileSpec);
          fstx.write(RelFileSpec + '\n');
          fstp.writeInt(FilePtr);
          /* Advance the file pointer to the byte after the last byte in the
             current filespec. Write the keywords to keywords.tmp,
             keyword_pointers.tmp and keywords.txt, then increment the
             filespec reference number ready for the next filespec. */
          FilePtr += RelFileSpec.length();
          int numkws = putKeywords(Keywords);
          nfs++;
          /* Pad the number of keywords with leading zeros and write the
             number of keywords plus the filespec to
             "number_of_keywords_per_HTML_file.txt". */
          String Numkws = "" + numkws;
          if(numkws > 999) Numkws = "XXX";
          else if(numkws < 10) Numkws = "00" + Numkws;
          else if(numkws < 100) Numkws = "0" + Numkws;
          kwpf.write(Numkws + " " + fp + "\n");
          /* If this HTML file doesn't have a description meta tag, write its
             filespec to "no_descriptions.txt". */
          if(!Desc) fsnd.write(fp + "\n");
        } else {
          abrt = true; // abort because there are more than 64k files
          break;       // and we only have a 16-bit pointer.
        }
      }
    } // end of for() loop
  }

  /* The following method embodies C A R Hoare's Quick Sort algorithm.
     However, instead of sorting the elements of an array, it sorts the
     records of "keyword_pointers.tmp". These are offset-extent (plus
     file-number) records which point to the actual keywords being sorted.
     The keywords themselves stay where they are: only the pointers are
     sorted. Note that it is a highly re-entrant method: it calls itself
     repeatedly. Called only by itself and from one place in sortKwPtrs(). */
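  /* A worked micro-example (hypothetical keywords): if records 0, 1 and 2 of
     keyword_pointers.tmp point at the keywords "zebra", "apple" and "mango"
     in keywords.tmp, then after qs(0,2) the records are reordered so that
     they point at "apple", "mango" and "zebra" in that order, while
     keywords.tmp itself is left untouched. */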
  static void qs(int LO, int HI) throws IOException {
    int lo = LO, // set moving lo to the LO end of the partition
        hi = HI; // set moving hi to the HI end of the partition
    if (HI <= LO) return; // exit if the partition contains nothing
    // get the keyword pointed to by the mid element of the partition
    String mid = getkw((LO + HI) >> 1);
    while(lo <= hi) { // loop through the partition until its indices cross
      /* While the current lowest keyword < the midway keyword, push the
         lower sort boundary up by one element. While the current highest
         keyword > the midway keyword, pull the upper sort boundary down by
         one element. */
      while(lo < HI && getkw(lo).compareTo(mid) < 0) lo++;
      while(hi > LO && getkw(hi).compareTo(mid) > 0) hi--;
      if(lo <= hi) { // IF LOW INDEX <= HIGH INDEX, SWAP THEIR 'CONTENTS'
        /* Get the index record (offset-extent-filenum) of the lo element,
           put the index record of the hi element in the lo element, then put
           the saved index record of the lo element in the hi element. */
        long x = getki(lo);
        putki(lo, getki(hi));
        putki(hi, x);
        lo++; // push the lower sort boundary up by one element and
        hi--; // pull the upper sort boundary down by one element
      }
    }
    if(LO < hi)   // If 'hi' has not yet reached the start of the partition,
      qs(LO, hi); // sort the lower partition; and
    if(lo < HI)   // if 'lo' has not yet reached the end of the partition,
      qs(lo, HI); // sort the upper partition.
  }

  // METHODS CALLED ONLY BY main() --------------------------------------------

  /* OPENS ALL FILES FOR LISTING THE FILESPECS OF ALL THE FILES IN THE WEBSITE
     AND THEN EXTRACTING THE KEYWORDS FROM THEM.
     Called only from one place in main(). */
  private static void extractKwds() throws IOException {
    bkwf = new FileWriter(F00); // Create and open all the files
    kwpf = new FileWriter(F01);
    fsts = new FileWriter(F02);
    fstx = new FileWriter(F03);
    fsnk = new FileWriter(F04);
    fsnd = new FileWriter(F05);
    fstp = new DataOutputStream(new FileOutputStream(F06));
    kwtm = new FileWriter(F07);
    kidx = new DataOutputStream(new FileOutputStream(F08));
    kttx = new FileWriter(F09);
    scan(FullPath); // Create the index and write the data into the files
    if(!abrt)                 // Write the final pointer (points to
      fstp.writeInt(FilePtr); // the byte beyond the end of the file).
    fsnd.close(); // Close all the files
    fsnk.close();
    fsts.close();
    fstx.close();
    fstp.close();
    kidx.close();
    kwtm.close();
    kttx.close();
    kwpf.close();
    bkwf.close();
  }

  /* Keyword File Sorter: an adaptation of C A R Hoare's Quick Sort
     algorithm. It handles files that are already sorted and files with
     duplicate keywords. Note that it only sorts the records of the keyword
     index file "keyword_pointers.tmp": it does NOT sort the actual keywords
     in "keywords.tmp". Called from only one place in main(). */
  private static void sortKwPtrs() throws IOException {
    /* Open "keyword_pointers.tmp" for random read and write and
       "keywords.tmp" for read-only. Then sort the pointers into alphabetical
       order of their keywords and close both files. */
    skif = new RandomAccessFile(F08, "rw");
    skwf = new RandomAccessFile(F07, "r");
    qs(0, (int)((skif.length() >> 3) - 1));
    skwf.close();
    skif.close();
  }
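  /* Each 8-byte record written by putkw() packs three fields into one long:
     bits 63-32 hold the keyword's offset in keywords.tmp, bits 31-16 its
     length and bits 15-0 the HTML file's reference number. A minimal sketch
     (not part of the program) of how such a record 'rec' would be decoded,
     mirroring what getkw() above and makeKwIdxs() below actually do:

        int fileNum = (int)(rec & 0xffff);          // HTML file number
        int length  = (int)((rec >>> 16) & 0xffff); // keyword length in bytes
        long offset = rec >>> 32;                   // offset in keywords.tmp
  */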
  /* HTML KEYWORD FILES BUILDER

     keyword_pointers.tmp   ooooeeff
        oooo = 4-byte (32-bit) offset of the keyword within "keywords.tmp"
        ee   = 2-byte (16-bit) length of the keyword within "keywords.tmp"
        ff   = 2-byte (16-bit) reference number (its position in the filespec
               list indexed by F06) of the HTML file in whose meta tag this
               occurrence of this keyword was found.

     keywords.tmp contains variable-length keywords whose offsets and lengths
     are given in "keyword_pointers.tmp". Note that a given word may appear
     many times in "keywords.tmp" because an entry is made for each
     occurrence of each keyword within the meta tags of all HTML files in the
     web site. Note also that the keywords in "keywords.tmp" are NOT in
     alphabetical order; however, the pointers "oooo" in
     "keyword_pointers.tmp" point to them in alphabetical order.

     This method now compacts the above information into another set of
     files as follows:

     KEYWORD TEXT POINTERS
     keyword_pointers.dat   pppp
        pppp = 4-byte (32-bit) offset of a keyword in "keyword_text.dat"

     KEYWORD TEXT STREAM
     keyword_text.dat contains variable-length keywords whose offsets are
     held in "keyword_pointers.dat". Note that in "keyword_text.dat", each
     keyword appears once only. It is therefore considerably shorter than
     "keywords.tmp".

     RELATIVE FILESPEC POINTERS to REFERENCE NUMBERS
     filespec_ref_pointers.dat   qqqq
        qqqq = 4-byte (32-bit) offset of the reference number of the HTML
               file within which this keyword first occurred.

     RELATIVE FILESPEC REFERENCE NUMBERS
     filespec_ref_numbers.dat   ff
        ff = 2-byte (16-bit) file number of an HTML file whose meta tag
             contains a particular keyword. There is an entry in this file
             for every occurrence of every keyword.

     Called from only one place in main(). */
  private static void makeKwIdxs() throws IOException {
    DataInputStream kit = new DataInputStream(new FileInputStream(F08));
    RandomAccessFile kwt = new RandomAccessFile(F07, "r");
    OutputStream kwts = new FileOutputStream(F10);
    DataOutputStream
      kwtp = new DataOutputStream(new FileOutputStream(F11)),
      fsrp = new DataOutputStream(new FileOutputStream(F12)),
      fsrn = new DataOutputStream(new FileOutputStream(F13));
    File f = new File(F08),
         g = new File(F07);
    int            // POINTER TO:
      p = 0,       // the current keyword in "keyword_text.dat"
      q = 0,       // the current file number in "filespec_ref_numbers.dat"
      N = (int)f.length() >> 3; // number of 8-byte records in
                                // "keyword_pointers.tmp"
    String w = ""; // holder for the current keyword
    /* For each record in "keyword_pointers.tmp", get its 8-byte content and
       write the HTML file's reference number to "filespec_ref_numbers.dat". */
    for(int n = 0; n < N; n++) {
      long o = kit.readLong();
      fsrn.writeShort((short)(o & 0xffff));
      int e = (int)((o >>>= 16) & 0xffff); // extract the length of the keyword
      byte B[] = new byte[e];   // create a new byte buffer to receive it
      kwt.seek(o >>> 16);       // move to the start of the keyword in "keywords.tmp"
      kwt.readFully(B);         // read the keyword into the byte buffer
      String s = new String(B); // copy it into a new string, s
      if(!w.equals(s)) {  // If it's a new keyword, then
        kwts.write(B);    // write it to "keyword_text.dat", its
        kwtp.writeInt(p); // pointer to "keyword_pointers.dat" and its
        fsrp.writeInt(q); // file-number offset to "filespec_ref_pointers.dat".
        p += e;           // Shift the pointer to the byte beyond this keyword in
        w = s;            // "keyword_text.dat" and make it the current keyword.
      }
      q += 2; // advance the file-number offset for "filespec_ref_pointers.dat"
    }
    /* Note that these pointers point to the byte beyond the last keyword and
       the byte beyond the last file number in "keyword_text.dat" and
       "filespec_ref_numbers.dat" respectively. */
    kwtp.writeInt(p); // write the final end-pointer to keyword_pointers.dat
    fsrp.writeInt(q); // write the final end-pointer to filespec_ref_pointers.dat
    kwtp.close(); kwts.close(); fsrp.close();
    fsrn.close(); kwt.close(); kit.close();
    f.delete(); // delete the temporary files
    g.delete();
  }

  /* COMPACT ALL THE FILES INTO A SINGLE index.dat FILE.
     The search engine index file set comprises:
       1. keyword_pointers.dat
       2. keyword_text.dat
       3. filespec_ref_pointers.dat
       4. filespec_ref_numbers.dat
       5. filespec_pointers.dat
       6. filespec_text.dat (F02), the filespec text stream file
     This method joins the above into a single file, index.dat. Its full path
     is ../../index.dat to place it in the website's home directory. */
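  /* Resulting layout of index.dat, as built below (sketch: a, b, c, d, e and
     f are the byte lengths of files 1 to 6 above, and the five header ints
     hold the cumulative offsets a, a+b, a+b+c, a+b+c+d and a+b+c+d+e):

        [5 x 32-bit offsets][keyword_pointers][keyword_text]
        [filespec_ref_pointers][filespec_ref_numbers]
        [filespec_pointers][filespec text]

     The extent of the final section is implied by the end of the file. */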
  private static void concat() throws IOException {
    int x;
    File f1 = new File(F11);
    File f2 = new File(F10);
    File f3 = new File(F12);
    File f4 = new File(F13);
    File f5 = new File(F06);
    File f6 = new File(F02);
    DataInputStream kwtp = new DataInputStream(new FileInputStream(f1));
    DataInputStream kwts = new DataInputStream(new FileInputStream(f2));
    DataInputStream fsrp = new DataInputStream(new FileInputStream(f3));
    DataInputStream fsrn = new DataInputStream(new FileInputStream(f4));
    DataInputStream fstp = new DataInputStream(new FileInputStream(f5));
    DataInputStream fsts = new DataInputStream(new FileInputStream(f6));
    DataOutputStream indx
      = new DataOutputStream(new FileOutputStream("index.dat"));
    /* Write the cumulative extents (in bytes) of: the keyword pointers, plus
       the keywords text, plus the pointers to the filespec reference
       numbers, plus the filespec reference numbers, plus the filespec
       pointers. The extent of the filespecs text is implied by the end of
       the file. */
    int a = (int)f1.length(); indx.writeInt(x = a);
    int b = (int)f2.length(); indx.writeInt(x += b);
    int c = (int)f3.length(); indx.writeInt(x += c);
    int d = (int)f4.length(); indx.writeInt(x += d);
    int e = (int)f5.length(); indx.writeInt(x += e);
    int f = (int)f6.length();
    byte B[]; // copy each file's bytes into index.dat
    B = new byte[a]; kwtp.readFully(B); indx.write(B, 0, a);
    B = new byte[b]; kwts.readFully(B); indx.write(B, 0, b);
    B = new byte[c]; fsrp.readFully(B); indx.write(B, 0, c);
    B = new byte[d]; fsrn.readFully(B); indx.write(B, 0, d);
    B = new byte[e]; fstp.readFully(B); indx.write(B, 0, e);
    B = new byte[f]; fsts.readFully(B); indx.write(B, 0, f);
    // CLOSE ALL THE FILES
    indx.close(); fsts.close(); fstp.close(); fsrn.close();
    fsrp.close(); kwts.close(); kwtp.close();
    // DELETE ALL EXCEPT index.dat
    f1.delete(); f2.delete(); f3.delete();
    f4.delete(); f5.delete(); f6.delete();
  }

  public static void main(String args[]) throws IOException {
    int L = args.length; // number of command-line arguments
    /* Provided a command-line argument has been entered, get the name of the
       website's base directory from the command line. If there is a second
       command-line argument, it is the name of a sub-directory of the site
       which is appended to the search path. */
    if(L > 0) {
      BaseDir = args[0];
      if(L > 1) SubDir = args[1];
    }
    /* If a search sub-directory has been specified, form the full path;
       otherwise assume indexing starts at the base directory level. */
    if(!SubDir.equals("")) FullPath = BaseDir + "/" + SubDir;
    else FullPath = BaseDir;
    /* Relative filespecs stay relative to the base directory (the first
       argument), so strip only the base directory path + its trailing '/'. */
    LenDir = BaseDir.length() + 1;
    // Form a file object for the directory to be searched.
    File pd = new File(FullPath);
    // Provided that the command-line argument is an existing directory...
    if(pd.isDirectory()) {
      extractKwds(); // list the filespecs and extract the keywords
      if(abrt) {
        System.out.println("More than 64k HTML files.");
        System.out.println("Modify spider.java for 32-bit");
        System.out.println("filespec text pointers.");
      } else {
        sortKwPtrs(); // sort the keyword pointers,
        makeKwIdxs(); // generate the keyword index files and
        concat();     // concatenate all the index components
      }               // into a single index.dat file.
    } else
      System.out.println(FullPath + " is not a directory.");
  }
}