/* COMMANDS:
   cd /home/rob/Private/website/webtools/keywords-checker/
   java keywordsNotInBodyText
   Creates the file keywordsNotInBodyText.txt
*/

import java.io.*;            // for file input/output handling
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;

/**
 * List the names of HTML files which contain one or more keywords that do
 * not appear as such within their body text.
 *
 * <p>Usage: {@code java keywordsNotInBodyText [baseDir [subDir]]}. The report
 * is written to {@code keywordsNotInBodyText.txt} in the current working
 * directory.
 *
 * @author Robert John Morton
 * @version 29 July 2019
 */
class keywordsNotInBodyText {

  /** Name of the report file, created in the current working directory. */
  private static final String REPORT_FILE = "keywordsNotInBodyText.txt";

  /** Base directory used when no command-line argument is given. */
  private static final String DEFAULT_BASE_DIR = "/home/rob/Private/website";

  /** Character entity for a soft hyphen; it is cut out of the body text. */
  private static final String SOFT_HYPHEN = "&shy;";

  /*
   * The names of all the top-level directories in the website that must be
   * indexed. There are other top-level directories which are excluded from
   * the indexing process for one reason or another.
   */
  private static final String[] FOLDERS = {
    "book", "chaos", "computers", "home", "internet", "landshare",
    "navigation", "poems", "radio", "science", "software"
  };

  private static Writer kr;            // report of keywords that don't appear in body text

  private static int
      dir_level = 0,                   // current directory level used in scan()
      nf = 0;                          // total number of files processed

  private static boolean               // SET TO TRUE WHEN:
      haveKeywords = false,            // current HTML file has keywords
      inBody = false;                  // currently collecting body text

  private static String
      tag = "",                        // lower-case text of the current tag
      Keywords = "",                   // current file's comma-delimited keywords
      bodyText = "";                   // filtered body text of the current HTML file


  /**
   * Checks whether the static string {@code tag} holds a keywords meta tag
   * and, if so, extracts the text between the quotes of its content
   * parameter into {@code Keywords}.
   *
   * <p>The tag must contain, in this order: "meta", "name", '=', "keywords",
   * "content", '=', an opening double quote and a closing double quote.
   * Called from only one place in examineThisHTMLfile().
   *
   * @return true if the tag matched and the extracted keyword text is not
   *         empty, false otherwise
   */
  private static boolean isValidKeywordsTag() {
    int p,                             // running search position within 'tag'
        q;                             // position of the closing quote

    if (((p = tag.indexOf("meta")) == -1)                // not a meta tag
        || ((p = tag.indexOf("name", p + 4)) == -1)      // has no name
        || ((p = tag.indexOf('=', p + 4)) == -1)         // has no name= sign
        || ((p = tag.indexOf("keywords", p + 1)) == -1)  // not a keywords tag
        || ((p = tag.indexOf("content", p + 8)) == -1)   // has no content parameter
        || ((p = tag.indexOf('=', p + 7)) == -1)         // has no content= sign
        || ((p = tag.indexOf('\"', p + 1)) == -1)        // has no opening quote
        || ((q = tag.indexOf('\"', p + 1)) == -1)) {     // has no closing quote
      return false;
    }

    Keywords = tag.substring(p + 1, q);                  // extract keywords
    return !Keywords.isEmpty();                          // empty list is not valid
  }


  /**
   * Reads one HTML file character by character. Until a valid keywords meta
   * tag has been seen, every completed tag is tested with
   * isValidKeywordsTag(). Once keywords have been found, the next
   * {@code /head} tag switches to body mode, after which the remaining
   * characters are filtered into {@code bodyText}.
   *
   * <p>Results are left in the static fields {@code haveKeywords},
   * {@code Keywords} and {@code bodyText}. A read error is reported on
   * standard error and whatever was gathered up to that point is kept.
   * Called from only one place in scan().
   *
   * <p>NOTE(review): in body mode every letter and space is kept, including
   * the letters of tag names and attributes, while line breaks, tabs, digits
   * and punctuation are dropped. This is unchanged from the original filter;
   * confirm that it is what is wanted.
   *
   * @param fp the HTML file to read
   */
  private static void examineThisHTMLfile(File fp) {
    boolean inTag = false;             // true while between '<' and '>'
    inBody = false;
    haveKeywords = false;
    Keywords = "";                     // clear the keywords of the previous HTML file
    bodyText = "";                     // clear the body text of the previous HTML file

    int shyState = 0;                  // how many characters of "&shy;" have matched
    StringBuilder body = new StringBuilder();    // filtered body text
    StringBuilder rawTag = new StringBuilder();  // raw text of the current tag

    // NOTE(review): FileReader decodes with the platform default charset.
    try (Reader fr = new BufferedReader(new FileReader(fp))) {
      int x;
      while ((x = fr.read()) != -1) {  // loop broken by end of file
        char c = (char) x;

        if (inBody) {
          // Cut out any soft hyphen entity that appears within the body text.
          if (c == SOFT_HYPHEN.charAt(shyState)) {
            shyState = (shyState + 1) % SOFT_HYPHEN.length();
          } else {
            shyState = 0;              // the entity's characters must be consecutive
            if (c >= 'A' && c <= 'Z') {
              c = (char) (c + 32);     // fold capital letters to lower case
            }
            if ((c >= 'a' && c <= 'z') || c == ' ') {
              body.append(c);
            }
          }
        } else if (c == '<') {
          inTag = true;                // from now on we are inside a tag
        } else if (c == '>') {
          inTag = false;
          // Locale.ROOT keeps the folding independent of the default locale.
          tag = rawTag.toString().toLowerCase(Locale.ROOT);

          if (!haveKeywords) {
            // Still looking for a keywords meta tag that contains keywords.
            if (isValidKeywordsTag()) {
              haveKeywords = true;
            }
          } else if (tag.equals("/head")) {
            inBody = true;             // start loading the body text
          }
          rawTag.setLength(0);         // ready for the next tag
        } else if (inTag) {
          rawTag.append(c);
        }
      }
    } catch (IOException e) {
      // Best effort: report the problem and carry on with what was read.
      System.err.println("Warning: could not read " + fp + ": " + e.getMessage());
    }

    bodyText = body.toString();
  }


  /**
   * Replaces the accented letters that the original tool handled with their
   * plain equivalents: e, u, i with grave, acute, circumflex or diaeresis;
   * a and o with grave, acute, circumflex or tilde; and c with cedilla.
   * Unicode escapes are used so that the result does not depend on the
   * encoding with which this source file is compiled.
   */
  private static String stripAccents(String s) {
    return s.replaceAll("[\u00e8\u00e9\u00ea\u00eb]", "e")
            .replaceAll("[\u00fb\u00f9\u00fa\u00fc]", "u")
            .replaceAll("[\u00ef\u00ee\u00ed\u00ec]", "i")
            .replaceAll("[\u00e0\u00e1\u00e2\u00e3]", "a")
            .replaceAll("[\u00f4\u00f3\u00f2\u00f5]", "o")
            .replaceAll("[\u00e7]", "c");
  }


  /**
   * Once the keywords and body text have been acquired from the current
   * HTML file, removes accents from the keywords, splits them at commas,
   * trims each one and writes every keyword that does not occur in
   * {@code bodyText} to the report.
   *
   * <p>The file's path is written on the line before the first unfound
   * keyword, so a file whose keywords are all found does not appear in the
   * report. A file without keywords is reported as such. Empty entries
   * (for example after a trailing comma) are ignored and not counted.
   * Called from only one place in scan().
   *
   * @param fp path of the HTML file, as it should appear in the report
   * @throws IOException if the report cannot be written
   */
  private static void checkThisHTMLfile(String fp) throws IOException {
    if (!haveKeywords) {
      kr.write(fp + '\n');
      kr.write("No keywords were found in this HTML file\n\n");
      return;
    }

    Keywords = stripAccents(Keywords);

    // trim() also removes line breaks and tabs around a keyword, so odd
    // line breaks inside the comma-delimited list do no harm.
    List<String> keywords = new ArrayList<String>();
    for (String piece : Keywords.split(",")) {
      String keyword = piece.trim();
      if (!keyword.isEmpty()) {
        keywords.add(keyword);
      }
    }

    boolean pathWritten = false;       // file path not yet written to the report
    for (String keyword : keywords) {
      // Search the body text, not the keyword list itself.
      if (bodyText.indexOf(keyword) == -1) {
        if (!pathWritten) {
          kr.write(fp + '\n');
          pathWritten = true;
        }
        kr.write(keyword + ", ");
      }
    }

    if (pathWritten) {
      kr.write("\nTotal keywords in this HTML file = " + keywords.size() + "\n\n");
    }
  }


  /**
   * Recursively walks directory {@code d}. Sub-directories whose names
   * contain "images", "applets" or "java_progs" are skipped, and at
   * directory level 0 only the sub-directories named in FOLDERS are entered.
   * Every file below level 0 whose path ends in ".html" but not in
   * "index.html" is examined and checked, and counted in {@code nf}.
   *
   * <p>Entries are processed in sorted name order so that the report is
   * reproducible. A directory that cannot be listed is skipped.
   * Called from main() and from itself.
   *
   * @param d path of the directory to scan
   * @throws IOException if the report cannot be written
   */
  private static void scan(String d) throws IOException {
    String[] entries = new File(d).list();
    if (entries == null) {
      return;                          // not a directory, or it cannot be read
    }
    Arrays.sort(entries);

    for (String dd : entries) {
      String fp = d + "/" + dd;        // path of this file or sub-directory
      File fo = new File(fp);

      if (fo.isDirectory()) {
        // At the top level only the listed folders are indexed; below the
        // top level every sub-directory is a candidate.
        boolean wanted = dir_level > 0 || Arrays.asList(FOLDERS).contains(dd);

        if (wanted
            && dd.indexOf("images") == -1
            && dd.indexOf("applets") == -1
            && dd.indexOf("java_progs") == -1) {
          dir_level++;
          try {
            scan(fp);
          } finally {
            dir_level--;               // restore the level even if scan() fails
          }
        }
      } else if (fo.isFile()
          && fp.endsWith(".html")
          && !fp.endsWith("index.html")
          && dir_level > 0) {
        examineThisHTMLfile(fo);
        checkThisHTMLfile(fp);
        nf++;                          // increment number of files processed
      }
    }
  }


  /**
   * Entry point. The first argument, if present, is the website's base
   * directory; the second, if present, is a sub-directory of it at which
   * the scan starts. If the resulting path is a directory, it is scanned
   * and the report file is written; otherwise a message is printed.
   *
   * <p>NOTE(review): the scan always starts at directory level 0, even when
   * a sub-directory is given, so the FOLDERS filter is then applied to that
   * sub-directory's children. Unchanged from the original; confirm intent.
   *
   * @param args optional base directory and sub-directory
   * @throws IOException if the report file cannot be created or written
   */
  public static void main(String args[]) throws IOException {
    String baseDir = args.length > 0 ? args[0] : DEFAULT_BASE_DIR;
    String subDir = args.length > 1 ? args[1] : "";

    // Compare string contents, not references.
    String fullPath = subDir.isEmpty() ? baseDir : baseDir + "/" + subDir;

    if (!new File(fullPath).isDirectory()) {
      System.out.println(fullPath + " is not a directory.");
      return;
    }

    nf = 0;                            // start every run from a clean state
    dir_level = 0;

    // try-with-resources closes the report even if the scan fails.
    try (Writer out = new BufferedWriter(new FileWriter(REPORT_FILE))) {
      kr = out;
      scan(fullPath);
      kr.write("Total number of files processed: " + nf + '\n');
    }
  }
}