/**
 * Web Site Link Checker.
 *
 * <p>This program checks the links in all the HTML files of a website held on
 * the local file system. It does not crawl the links themselves. Instead, it
 * verifies that the filepath argument of each {@code href=} or {@code src=}
 * points to an existing file. This can be any file, not just an HTML file:
 * txt, java, c, pdf and other files that a web browser can display.
 *
 * <p>The program creates the following output files in the current directory:
 * <pre>
 *   good_links.txt       good_imgs.txt
 *   bad_links.txt        bad_imgs.txt
 *   external_links.txt   external_imgs.txt
 *   malformed_links.txt  malformed_imgs.txt
 *   hash_links.txt       hash_imgs.txt
 * </pre>
 * Each contains the full filespec of every HTML file that has at least one
 * entry of that kind, followed by a tab-indented list of the relevant links.
 *
 * <p>IMPORTANT: this program requires strict syntax for hyperlink and image
 * tags. The tag must begin with {@code a href="} or {@code img src="}: the
 * equals sign must be close-coupled to key and value with no intervening
 * spaces, and the tag and attribute names must be in lower case.
 *
 * <p>NOTE(review): the imports, the class declaration, the static field
 * declarations and the header of {@code checkImg} were missing from the copy
 * of this file that was reviewed. They have been reconstructed from the way
 * the surviving code uses them. The class name and the exact contents of
 * {@link #FE} are assumptions and should be confirmed against the original.
 * Requires Java 7 or later.
 *
 * @author Robert John Morton YE572246C
 * @version 03 July 2009 HTML revised version 2018-09-10 11:36:36
 */

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Reader;
import java.nio.file.Files;
import java.nio.file.InvalidPathException;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

class LinkChecker {

    /**
     * File-name endings accepted for image links (png, jpg, gif).
     * NOTE(review): reconstructed; confirm whether the original included the dot.
     */
    static final String[] FE = {".png", ".jpg", ".gif"};

    /** Sub-directories that scan() does not descend into. */
    private static final List<String> SKIPPED_DIRS = Arrays.asList(
        "images", "webtools", "articles-pdf", "applets", "java_progs");

    // Per-file result lists for hyperlinks. They are emptied at the start of
    // checkLinks() and hold the raw link text (no leading tab).
    static final List<String> GL = new ArrayList<>();   // good links
    static final List<String> BL = new ArrayList<>();   // bad links
    static final List<String> EL = new ArrayList<>();   // external links
    static final List<String> ML = new ArrayList<>();   // malformed links
    static final List<String> FL = new ArrayList<>();   // good links with a #anchor

    // Per-file result lists for images.
    static final List<String> GI = new ArrayList<>();   // good images
    static final List<String> BI = new ArrayList<>();   // bad images
    static final List<String> EI = new ArrayList<>();   // external images
    static final List<String> MI = new ArrayList<>();   // malformed images
    static final List<String> FI = new ArrayList<>();   // never filled by this file

    // Output streams for the ten report files; opened and closed by main().
    static FileWriter gudLnks, badLnks, extLnks, malLnks, haxLnks;
    static FileWriter gudImgs, badImgs, extImgs, malImgs, haxImgs;

    /** Current recursion depth of scan(); maintained but not read in this file. */
    static int dir_level = 0;

    /** Length of the base directory name + 1; set by main(), not read in this file. */
    static int dl;

    /**
     * Extracts the filespec from a captured hyperlink tag.
     *
     * <pre>
     *   a href="filespec"
     *   0     6 8       p
     * </pre>
     *
     * @param Tag the text between '&lt;' and '&gt;', in its original case
     * @return "" if the tag is a comment or does not contain {@code a href};
     *         "MALFORMED: " + Tag if {@code ="} is not at position 6 or the
     *         closing quote is missing; otherwise the text between the quotes
     */
    static String extractHyperLink(String Tag) {
        if (Tag.startsWith("!--")) {            // a comment tag
            return "";
        }
        if (Tag.indexOf("a href") == -1) {      // not a hyperlink tag
            return "";
        }
        String malformed = "MALFORMED: " + Tag;
        if (Tag.indexOf("=\"") != 6) {          // =" must start at character 6
            return malformed;
        }
        int p = Tag.indexOf('"', 8);            // position of the closing quote
        if (p < 8) {
            return malformed;
        }
        return Tag.substring(8, p);
    }

    /**
     * Tests whether a relative link found in an HTML file points at an
     * existing regular file.
     *
     * <p>Any {@code #anchor} suffix is removed from the link, the remainder is
     * appended to the directory part of {@code fp}, and "." and ".." segments
     * are then resolved textually. For example, with
     * <pre>
     *   fp = /home/rob/website/projects/navigation/wayptapp/geom/wpencpg.htm
     *   lp = ../../airnav/airnav6.htm#PreCap
     * </pre>
     * the file tested is
     * {@code /home/rob/website/projects/navigation/airnav/airnav6.htm}.
     *
     * @param fp filespec of the HTML file that contains the link ('/' separated)
     * @param lp the link text taken from the tag
     * @return true if the resolved path is an existing regular file; false
     *         otherwise, including when the path cannot be parsed
     */
    static boolean validLink(String fp, String lp) {
        int hash = lp.indexOf('#');
        String target = (hash == -1) ? lp : lp.substring(0, hash);

        // Directory of the HTML file including its trailing '/', or "" when
        // fp has no directory part.
        String dir = fp.substring(0, fp.lastIndexOf('/') + 1);

        try {
            return Files.isRegularFile(Paths.get(dir + target).normalize());
        } catch (InvalidPathException e) {
            return false;       // the link holds characters no file name can have
        }
    }

    /**
     * Extracts the URL or filespec from a captured image tag.
     *
     * <pre>
     *   img src="filespec"
     *   0      7 9       p
     * </pre>
     *
     * @param Tag the text between '&lt;' and '&gt;', in its original case
     * @return "" if the tag is a comment or does not contain {@code img src};
     *         "MALFORMED: " + Tag if {@code ="} is not at position 7, the
     *         closing quote is missing, or the quoted text does not end with
     *         one of the endings in {@link #FE}; otherwise the quoted text
     */
    static String extractImgURL(String Tag) {
        if (Tag.startsWith("!--")) {            // a comment tag
            return "";
        }
        if (Tag.indexOf("img src") == -1) {     // not an image source tag
            return "";
        }
        String malformed = "MALFORMED: " + Tag;
        if (Tag.indexOf("=\"") != 7) {          // =" must start at character 7
            return malformed;
        }
        int p = Tag.indexOf('"', 9);            // position of the closing quote
        if (p < 9) {
            return malformed;
        }
        String url = Tag.substring(9, p);
        for (String ending : FE) {
            if (url.endsWith(ending)) {
                return url;
            }
        }
        return malformed;
    }

    /** Returns true if the text contains "http://" or "https://" anywhere. */
    private static boolean isExternal(String lp) {
        return lp.contains("http://") || lp.contains("https://");
    }

    /**
     * Checks whether the captured tag is an image tag and, if so, files its
     * URL under external, malformed, bad or good images, tested in that order.
     * Called from only one place in checkLinks().
     *
     * @param Tag the text between '&lt;' and '&gt;'
     * @param fp  filespec of the HTML file being examined
     */
    static void checkImg(String Tag, String fp) {
        String lp = extractImgURL(Tag);
        if (lp.isEmpty()) {                     // not an image tag
            return;
        }
        if (isExternal(lp)) {
            EI.add(lp);
        } else if (lp.contains("MALFORMED")) {
            MI.add(lp);
        } else if (!validLink(fp, lp)) {
            BI.add(lp);
        } else {
            GI.add(lp);
        }
    }

    /**
     * Checks whether the captured tag is a hyperlink and, if so, files its
     * target under external, malformed, bad or good links, tested in that
     * order. A link that starts with '#' (an anchor within the same page) is
     * always treated as good. A good link that contains a '#' is also added
     * to the hash links list. Called from only one place in checkLinks().
     *
     * @param Tag the text between '&lt;' and '&gt;'
     * @param fp  filespec of the HTML file being examined
     */
    static void checkLnk(String Tag, String fp) {
        String lp = extractHyperLink(Tag);
        if (lp.isEmpty()) {                     // not a hyperlink tag
            return;
        }
        if (isExternal(lp)) {
            EL.add(lp);
        } else if (lp.contains("MALFORMED")) {
            ML.add(lp);
        } else if (!validLink(fp, lp) && !lp.startsWith("#")) {
            BL.add(lp);
        } else {
            GL.add(lp);
            // Decide from the link itself, so that a '#' elsewhere in the tag
            // or in an earlier rejected link cannot mark this one.
            if (lp.indexOf('#') != -1) {
                FL.add(lp);
            }
        }
    }

    /**
     * Writes one file's section of a report: a blank line, the HTML file's
     * filespec, then one tab-indented line per entry. Writes nothing when
     * there are no entries or the report has not been opened.
     */
    private static void writeSection(FileWriter out, String fp, List<String> entries)
            throws IOException {
        if (out == null || entries.isEmpty()) {
            return;
        }
        out.write("\n" + fp + "\n");
        for (String entry : entries) {
            out.write("\t" + entry + "\n");
        }
    }

    /**
     * Examines one HTML file. Every tag that follows the opening body tag is
     * passed to checkLnk() and checkImg(), and the results are then appended
     * to the report files. Called from only one place in scan().
     *
     * <p>If the file cannot be read, a message is printed on the standard
     * error stream and nothing is written to the reports for this file.
     *
     * @param fp filespec of the HTML file to examine
     */
    static void checkLinks(String fp) {
        for (List<String> list : Arrays.asList(GL, BL, EL, ML, FL, GI, BI, EI, MI, FI)) {
            list.clear();
        }

        boolean inTag = false;      // true while between '<' and '>'
        boolean inBody = false;     // true once the opening body tag has been seen
        StringBuilder tagText = new StringBuilder();

        try (Reader in = new BufferedReader(new FileReader(fp))) {
            int x;
            while ((x = in.read()) != -1) {
                char c = (char) x;
                if (c == '<') {
                    inTag = true;
                } else if (c == '>') {
                    String Tag = tagText.toString();
                    if (Tag.regionMatches(true, 0, "body", 0, 4)) {
                        inBody = true;          // tag starts with "body", any case
                    } else if (inBody) {
                        checkLnk(Tag, fp);
                        checkImg(Tag, fp);
                    }
                    tagText.setLength(0);       // ready for the next tag
                    inTag = false;
                } else if (inTag) {
                    tagText.append(c);
                }
            }
        } catch (IOException e) {
            System.err.println("Cannot read " + fp + ": " + e.getMessage());
            return;
        }

        try {
            writeSection(gudLnks, fp, GL);
            writeSection(badLnks, fp, BL);
            writeSection(extLnks, fp, EL);
            writeSection(malLnks, fp, ML);
            writeSection(haxLnks, fp, FL);
            writeSection(gudImgs, fp, GI);
            writeSection(badImgs, fp, BI);
            writeSection(extImgs, fp, EI);
            writeSection(malImgs, fp, MI);
            writeSection(haxImgs, fp, FI);
        } catch (IOException e) {
            System.err.println("Cannot write report for " + fp + ": " + e.getMessage());
        }
    }

    /**
     * Recursively walks the directory {@code d}. Sub-directories are entered
     * by calling this method again, except those named in SKIPPED_DIRS, so
     * any depth of nesting is handled. Every regular file whose name contains
     * ".html" is passed to checkLinks(). Called only by itself and from one
     * place in main().
     *
     * @param d name of the directory to walk
     */
    private static void scan(String d) throws IOException {
        String[] entries = new File(d).list();
        if (entries == null) {      // not a directory, or it could not be read
            System.err.println("Cannot list " + d);
            return;
        }
        for (String dd : entries) {
            String fp = d + "/" + dd;
            File fs = new File(fp);
            if (fs.isDirectory()) {
                if (!SKIPPED_DIRS.contains(dd)) {
                    dir_level++;
                    scan(fp);
                    dir_level--;
                }
            } else if (fs.isFile() && dd.contains(".html")) {
                // Only the file's own name is tested, so that ".html" in a
                // parent directory name does not select every file below it.
                checkLinks(fp);
            }
        }
    }

    /** Closes every non-null writer, then rethrows the first failure, if any. */
    private static void closeAll(FileWriter... writers) throws IOException {
        IOException first = null;
        for (FileWriter w : writers) {
            if (w == null) {
                continue;
            }
            try {
                w.close();
            } catch (IOException e) {
                if (first == null) {
                    first = e;
                }
            }
        }
        if (first != null) {
            throw first;
        }
    }

    /**
     * Entry point. Opens the ten report files in the current directory, scans
     * the website and closes the reports again, even if the scan fails.
     *
     * @param args optional: args[0] is the website's base directory
     *             (default "../..")
     */
    public static void main(String args[]) throws IOException {
        String bd = "../..";                    // default base directory
        if (args.length > 0) {
            bd = args[0];
        }
        dl = bd.length() + 1;

        if (!new File(bd).isDirectory()) {
            System.out.println(bd + " is not a directory.");
            return;
        }

        try {
            gudLnks = new FileWriter("good_links.txt");
            badLnks = new FileWriter("bad_links.txt");
            extLnks = new FileWriter("external_links.txt");
            malLnks = new FileWriter("malformed_links.txt");
            haxLnks = new FileWriter("hash_links.txt");
            gudImgs = new FileWriter("good_imgs.txt");
            badImgs = new FileWriter("bad_imgs.txt");
            extImgs = new FileWriter("external_imgs.txt");
            malImgs = new FileWriter("malformed_imgs.txt");
            haxImgs = new FileWriter("hash_imgs.txt");
            scan(bd);
        } finally {
            closeAll(gudLnks, badLnks, extLnks, malLnks, haxLnks,
                     gudImgs, badImgs, extImgs, malImgs, haxImgs);
        }
    }
}