/** * Chapter Indexer * @author Robert John Morton * @version 02 July 2009 */ /* This program does NOT generate the java search engine index. Look in the folder 'spider' for this. This program creates a file index??.htm containing a list of hyperlinked titles + description text for every chapter and article in my website-based book The Lost Inheritance. The ?? in index??.htm stands for chapter numbers in the range 01 to 13. Optional 3rd argument E causes entries without a description to be excluded. This program excludes files with a tag from articles_index.htm and instead writes them to the file called articles_noindex.htm. Sample command lines: create the index01.htm file for all articles in Chapter 1: java chapter_indexer home/rob/Private/website_css/book 01 E create all index??.htm files for all chapters (01 to 13): java chapter_indexer home/rob/Private/website_css/book 00 */ import java.io.*; class chapter_indexer { static int dir_level = 0, // current directory level used in scan() dl, // length of parent directory path name + terminating '/' arts = 0; // chapter+articles files counter static Writer fslist, // for file 'index??.htm' for chapter indexes frlist, // for file 'noindex??.htm' for rejects indexes chcont; // for file 'noindex??.htm' for rejects indexes static String A[] = new String[4000], // String array for the articles' filespecs T[] = new String[4000], // String array for the articles' titles ST[] = new String[4000], // String array for the articles' short titles Q[] = new String[4000], // String array for the articles' descriptions Descr = "", // for HTML file description shortTitle = "", // short title of HTML file sd = "", // chapter number CH = "", CHAPPY = ""; static boolean ind = true, // include entries with no descriptions GotDescription = false, GotTheNoIndexTag = false, GotShortTitle = false, B[] = new boolean[4000]; // array of noindex flags /* DETERMINE WHETHER OR NOT THE PRESENTED TAG IS A DESCRIPTION META TAG Called from only one place in HTMLtitle(). */ static void isDescriptionTag(String Tag) { if(GotDescription) return; /* If the tag text text begins with the word "meta" and it contains the word "description" then find the position of the first letter of the word "content". */ if((Tag.indexOf("meta") == 0) && (Tag.indexOf("description") != -1)) { int x = Tag.indexOf("content"); /* Provided the word "content" was found and located, find the posi- tions of the opening quote mark and of the closing quote mark. */ if(x != -1) { x = Tag.indexOf('\"', x + 7) + 1; int y = Tag.indexOf('\"', x); /* Provided some actual text exists between the quote marks, put it in the Descr string and set the "got description" flag to TRUE. */ if(y > x) { Descr = Tag.substring(x, y); GotDescription = true; } } } } /* TEST TO SEE IF THE CAPTURED TAG CONTENT IS A "noindex" TAG. Called from only one place in HTMLtitle(). */ static void isNoIndexTag(String Tag) { if(GotTheNoIndexTag) return; // if we already know that it is, then exit /* If it begins with the word "meta" and it contains the word "robots" and the word "noindex" then it is a "noindex" tag, so set flag = true and exit. */ if((Tag.indexOf("meta") == 0) && (Tag.indexOf("robots") != -1) && (Tag.indexOf("noindex") != -1)) GotTheNoIndexTag = true; return; } /* Test to see if the captured tag content is a short title tag of the form . Called from only one place in HTMLtitle(). */ static void isShortTitleTag(String Tag) { if(GotShortTitle) return; if(!((Tag.startsWith("!--")) && (Tag.endsWith("--")))) return; if(Tag.equals("")) shortTitle = "NULL"; else { int l = Tag.length(); if(l < 25) shortTitle = Tag.substring(3,l - 2); GotShortTitle = true; } } /* EXAMINE THE CONTENTS OF THE HTML FILE Called from one place in scan(). */ static String HTMLtitle(String fp) { FileReader fr; // file reader for the file to be examined boolean // These flags are true when getting inTag = false, // characters that are part of a tag name inTitle = false; // characters that are part of the file's title int x; // for receiving java UNICODE character input from file stream char c; // for each character retrieved from the file input stream String Tag = "", // raw tag input string Title = ""; // title content string Descr = ""; // clear the description string GotDescription = false; GotTheNoIndexTag = false; GotShortTitle = false; try { fr = new FileReader(fp); // create a file reader for this file while((x = fr.read()) != -1) { // loop broken by End-Of-File c = (char)x; // get the next character from the file stream if(c == '<') // If initial tag-delimiter encountered, inTag = true; // we are inside a tag, so exit. /* If a tag termination delimiter is encountered, rationalise tag text to lower case for comparison. */ else if(c == '>') { String tag = Tag.toLowerCase(); /* If it is an initial title tag