/**
 * HTML Tag Checker
 * @author Robert J Morton
 * @version 10 July 2018
 * @copyright Sep 2018 Robert J Morton (all rights reserved)
 */

/* This program looks for space-padded equals signs in HTML tags. It scans
   each HTML file in the website directory tree and checks every tag for an
   equals sign that has a space immediately before or after it, ignoring
   anything inside a double-quoted string within the tag. The path of each
   file that contains at least one such tag is written, one path per line,
   to the output file 'naked_equals.txt' in the current working directory. */

import java.io.*;

class naked_equals {

  /** Base directory used when no first command line argument is given. */
  private static final String DEFAULT_BASE_DIR = "/home/rob/Private";

  /** Sub-directory used when no second command line argument is given. */
  private static final String DEFAULT_SUB_DIR = "website";

  private static Writer ne;         // receives the paths of files containing '= ' or ' =' in a tag
  private static String fp;         // full path to the HTML file currently being examined
  private static boolean E = false; // true: a naked '=' was detected in the last file examined

  /**
   * Character-by-character detector for a space-padded '=' inside an HTML tag.
   *
   * A tag is entered when '<' is immediately followed by a lower-case letter
   * (so closing tags, "<!..." constructs and upper-case tag names are never
   * entered) and is left on a '>' that is not preceded by a space. Only
   * double quotes are treated as string delimiters.
   *
   * The sequence "-->" is recognised solely so that its '>' is not treated
   * as the end of a tag; comment openings are not tracked, so a tag written
   * inside a comment is examined like any other tag.
   */
  private static final class TagScanner {
    private int prev2 = 0;            // character read two steps ago
    private int prev1 = 0;            // character read one step ago
    private boolean inTag = false;    // true: we're inside a normal HTML tag
    private boolean inQuotes = false; // true: we're inside a quoted string within a tag

    /**
     * Feeds the next character to the detector.
     *
     * @param cur the character just read
     * @return true as soon as ' =' or '= ' is seen inside a tag but outside
     *         a quoted string; the caller should then stop feeding input
     */
    boolean accept(int cur) {
      if (inTag && !inQuotes
          && ((cur == '=' && prev1 == ' ') || (cur == ' ' && prev1 == '='))) {
        return true;
      }

      // The branches below are mutually exclusive and order-sensitive.
      if (prev2 == '-' && prev1 == '-' && cur == '>') {
        // "-->" captured: leave the tag and quote state untouched.
      } else if (prev1 == '<' && cur >= 'a' && cur <= 'z') {
        inTag = true;                  // '<' + lower-case alpha: entered a tag
      } else if (cur == '>' && prev1 != ' ') {
        inTag = false;                 // '>' not preceded by a space: left the tag
      } else if (cur == '\"') {
        // Inside a tag a quote toggles the quoted state; outside a tag we
        // are necessarily outside a quoted string within a tag.
        inQuotes = inTag && !inQuotes;
      }

      prev2 = prev1;                   // shift the captured sequence along
      prev1 = cur;                     // ready for the next character
      return false;
    }
  }

  /**
   * Tells whether the given HTML text contains a space-padded '=' inside a
   * tag, using the same rules as {@link #tagCapture()}.
   *
   * @param html the text to examine; may be empty
   * @return true if a ' =' or '= ' occurs inside a tag and outside a
   *         double-quoted string
   */
  static boolean hasNakedEquals(CharSequence html) {
    TagScanner scanner = new TagScanner();
    for (int i = 0; i < html.length(); i++) {
      if (scanner.accept(html.charAt(i))) {
        return true;
      }
    }
    return false;
  }

  /**
   * Examines the file named by 'fp' and sets 'E' to true if it contains an
   * '=' sign that is preceded or followed by a space within an HTML tag,
   * excluding any quoted strings within the tag. Reading stops at the first
   * such occurrence. Called from only one place in scan().
   *
   * If the file cannot be opened or read, a message is printed to standard
   * error and 'E' is left false, so the scan carries on with the next file.
   */
  static void tagCapture() {
    E = false;                         // reset the naked equals detector flag
    try (Reader in = new BufferedReader(new FileReader(fp))) {
      TagScanner scanner = new TagScanner();
      int ch;
      // read() returns -1 at end of file; no exception is involved.
      while ((ch = in.read()) != -1) {
        if (scanner.accept(ch)) {
          E = true;                    // one hit is enough for this file
          break;
        }
      }
    } catch (IOException e) {
      System.err.println("naked_equals: cannot read " + fp + ": " + e.getMessage());
    }
  }

  /**
   * Examines the entries of directory 'd'. Each regular file whose path ends
   * in ".html" is checked with tagCapture() and, if flagged, its path is
   * written to the output writer. Each sub-directory is scanned recursively
   * unless its path ends with one of the excluded development directory
   * names. Called by itself and from one place in main().
   *
   * @param d path of the directory to scan
   * @throws IOException if writing to the output file fails
   */
  private static void scan(String d) throws IOException {
    File fd = new File(d);             // file object for the given directory
    String D[] = fd.list();            // names of all items in this directory
    if (D == null) {
      // list() returns null when 'd' is not a directory or cannot be read.
      System.err.println("naked_equals: cannot list " + d);
      return;
    }

    for (int i = 0; i < D.length; i++) {
      fp = d + "/" + D[i];             // full path to the entry being examined
      File fs = new File(fp);

      if (fs.isDirectory()
          && !fp.endsWith("webtools")
          && !fp.endsWith("images")
          && !fp.endsWith("applets")
          && !fp.endsWith("java_progs")
          && !fp.endsWith("C-programs")) {
        scan(fp);                      // not a development directory: recurse
      } else if (fs.isFile() && fp.endsWith(".html")) {
        tagCapture();
        if (E) {                       // a space-padded '=' has been found
          ne.write(fp + '\n');         // record the file in which it occurred
        }
      }
    }
  }

  /**
   * Entry point. The directory scanned is "base/sub", where 'base' is the
   * first command line argument and 'sub' the second. Either may be omitted,
   * in which case the built-in default is used.
   *
   * @param args optional: [0] base directory, [1] sub-directory to scan
   * @throws Exception if the output file cannot be created or written
   */
  public static void main(String args[]) throws Exception {
    String base = (args != null && args.length > 0) ? args[0] : DEFAULT_BASE_DIR;
    String sub  = (args != null && args.length > 1) ? args[1] : DEFAULT_SUB_DIR;

    String d = base + "/" + sub;       // form the full path
    File pd = new File(d);             // file object for the full path

    if (pd.isDirectory()) {
      // try-with-resources closes the output file even if scan() throws.
      try (Writer out = new FileWriter("naked_equals.txt")) {
        ne = out;
        scan(d);                       // scan the specified directory tree
      }
    } else {
      System.out.println(d + " is not a directory.");
    }
  }
}