/**
* Web Site Link Checker.
* @author Robert John Morton YE572246C
* @version 03 July 2009 HTML revised version 2018-09-10 11:36:36 */
/* This program checks all the links in all the HTML files in
the website. It does not crawl the links themselves. Instead, it verifies that
the filepath argument of each "href=" points to a valid file. This can be any
file: not just an HTML file. It includes links to txt, java, c, pdf and other
files that can be displayed by some means within or from a web browser.
This program creates the following output files:
bad_imgs.txt
bad_links.txt
external_imgs.txt
external_links.txt
good_imgs.txt
good_links.txt
malformed_imgs.txt
malformed_links.txt
hash_imgs.txt
hash_links.txt
Each contains the full filespec of the HTML file currently being examined
with an indented list of relevant links for that file.
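IMPORTANT: this program requires strict syntax for hyperlink and image links.
That is: the equals sign between key and value must be close-coupled, with no
intervening spaces, e.g.
    a href="dir/page.html"
    0       8            p
where 0 is the start of the captured tag text, 8 is the first character of the
filespec and p is the position of the terminating quote.
Called from only one place in checkLnk(). */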
/**
 * Extracts the link target from the text of a captured hyperlink tag.
 *
 * The tag text is whatever lay between '<' and '>' in the HTML file. Strict
 * syntax is required: the text must begin with  a href="  so that the =" pair
 * sits at index 6 and the target starts at index 8.
 *
 * Side effect: the class-level 'hashtag' flag is rewritten on every call so
 * that it describes THIS tag only. It is true exactly when the extracted
 * target contains a '#'. (Previously the flag was only ever set, never
 * cleared, and the '#' search also covered any attributes that followed the
 * closing quote of the href value.)
 *
 * @param Tag raw (case-preserved) tag text without the angle brackets
 * @return ""                   if the tag is a comment or has no "a href";
 *         "MALFORMED: " + Tag  if the =" pair is not at index 6 or the
 *                              closing quote is missing;
 *         otherwise the text between the quotes.
 */
static String extractHyperLink(String Tag) {
    hashtag = false;                       // never carry a flag over from an earlier tag

    if (Tag.startsWith("!--") || Tag.indexOf("a href") == -1)
        return "";                         // comment tag, or not a hyperlink tag

    String E = "MALFORMED: " + Tag;        // error text returned for bad syntax

    if (Tag.indexOf("=\"") != 6)           // first =" must sit directly after "a href"
        return E;

    int p = Tag.indexOf("\"", 8);          // position of the terminating quote
    if (p < 8)                             // no terminating quote found
        return E;

    String link = Tag.substring(8, p);     // the enclosed filespec
    hashtag = link.indexOf('#') != -1;     // look for '#' inside the link text only
    return link;
}
/* TRUNCATION AND JOINING OF THE HTML FILE'S FILESPEC AND CURRENT LINK
Example 1:
Chopping the "../" off the link: ../../airnav/airnav6.htm#PreCap   lp
                                       y
(y marks the first character after the last "../")
Example 2:
Chopping the same number of levels off the HTML file's filespec
/home/rob/website/projects/navigation/wayptapp/geom/wpencpg.htm   fp
                                     j        j    j           j
(j starts at the end of fp and steps back one '/' at a time)
Called from one place each in checkImg and checkLnk(). */
/**
 * Tests whether a relative link found in an HTML file points to an existing
 * regular file.
 *
 * The link is resolved against the directory of the HTML file: the file name
 * is removed from 'fp', then one further directory level is removed for each
 * leading "../" of 'lp', and the remainder of the link is appended. Any
 * "#anchor" suffix of the link is ignored.
 *
 * Differences from the earlier implementation:
 *  - a link that climbs above the top of 'fp' (or an 'fp' with no '/') now
 *    yields false instead of throwing StringIndexOutOfBoundsException;
 *  - only LEADING "../" sequences are counted, so a "../" in the middle of a
 *    link is left in place for the file system to resolve;
 *  - the '#' cut is applied to the link only, never to the directory part.
 *
 * @param fp filespec of the HTML file that contains the link ('/' separated)
 * @param lp link text as extracted from the tag
 * @return true if the resolved path is an existing regular file
 */
static boolean validLink(String fp, String lp) {
    int linkStart = 0;   // index in lp of the first character after the leading "../" run
    int levels = 1;      // '/' separators to step back over: 1 removes the file name

    while (lp.startsWith("../", linkStart)) {
        linkStart += 3;
        levels++;
    }

    // Step back through fp one '/' at a time; 'cut' ends on the '/' that
    // terminates the directory the rest of the link is relative to.
    int cut = fp.length();
    for (int i = 0; i < levels; i++) {
        cut = fp.lastIndexOf('/', cut - 1);
        if (cut < 0)
            return false;                  // link climbs above the top of fp
    }

    String rest = lp.substring(linkStart);
    int hash = rest.indexOf('#');
    if (hash != -1)
        rest = rest.substring(0, hash);    // drop the internal anchor name

    return new File(fp.substring(0, cut + 1) + rest).isFile();
}
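/* EXTRACT THE URL OR FILESPEC FROM A CAPTURED IMAGE TAG
    img src="dir/pic.png"
    0        9          p
where 9 is the first character of the URL or filespec and p is the position
of the terminating quote.
Called from only one place in checkImg(). */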
/**
 * Pulls the image URL or filespec out of the text of a captured image tag.
 *
 * Strict syntax is required: the first =" in the tag text must be at index 7
 * (directly after "img src") so that the URL starts at index 9.
 *
 * @param Tag raw tag text without the angle brackets
 * @return ""                   if the tag is a comment or has no "img src";
 *         "MALFORMED: " + Tag  if the =" pair is misplaced, the closing quote
 *                              is missing, or the URL ends with none of the
 *                              extensions listed in FE;
 *         otherwise the text between the quotes.
 */
static String extractImgURL(String Tag) {
    final String malformed = "MALFORMED: " + Tag;   // built from the untouched tag text

    // Comment tags and tags without an image source are of no interest.
    if (Tag.startsWith("!--") || !Tag.contains("img src")) {
        return "";
    }

    // The first =" must come immediately after "img src".
    if (Tag.indexOf("=\"") != 7) {
        return malformed;
    }

    // Locate the quote that terminates the value that starts at index 9.
    final int closingQuote = Tag.indexOf('"', 9);
    if (closingQuote < 9) {
        return malformed;
    }

    // Accept the URL only if it ends with one of the extensions held in FE.
    final String url = Tag.substring(9, closingQuote);
    for (String extension : FE) {
        if (url.endsWith(extension)) {
            return url;
        }
    }
    return malformed;
}
/* CHECK TO SEE IF THE CAPTURED TAG IS A HYPERLINK TO AN IMAGE FILE
tag
return; // and exit if not.
/* If it's an EXTERNAL image, save it in the External Images Array.
Else if the link is malformed, save it to the Malformed Images Array.
Else if it is a bad link, save it in the Bad Images Array. Otherwise,
save it normally in the Good Images Array.*/
if((lp.indexOf("http://") != -1) || (lp.indexOf("https://") != -1))
EI[++iEI] = '\t' + lp;
else {
if(lp.indexOf("MALFORMED") != -1)
MI[++iMI] = '\t' + lp;
else {
if(!validLink(fp, lp))
BI[++iBI] = '\t' + lp; // invalid URLs
else
GI[++iGI] = '\t' + lp; // good URLs
}
}
}
/* CHECK TO SEE IF THE CAPTURED TAG IS A HYPERLINK TO AN HTML DOCUMENT
AND, IF SO, WHETHER IT IS GOOD OR MALFORMED AND WHETHER OR NOT IT HAS
A HASH EXTENSION. Called from only one place in checkLinks(). */
/**
 * Classifies the hyperlink (if any) held in a captured tag and appends it,
 * prefixed with a tab, to the matching per-file list:
 *   EL  external links   (text contains "http://" or "https://")
 *   ML  malformed links  (text contains "MALFORMED")
 *   BL  bad links        (validLink() fails and the link is not a bare "#anchor")
 *   GL  good links       (everything else)
 *   FL  hash links       (good links for which the 'hashtag' flag was raised)
 *
 * The 'hashtag' flag raised by extractHyperLink() is read and cleared
 * immediately after extraction. Previously it was cleared only in the
 * good-link branch, so a '#' seen in an external or bad link stayed set and
 * caused the next good link to be filed as a hash link.
 *
 * Called from only one place in checkLinks().
 *
 * @param Tag raw tag text without the angle brackets
 * @param fp  filespec of the HTML file being examined
 */
static void checkLnk(String Tag, String fp) {
    String lp = extractHyperLink(Tag);

    // Consume the flag now so that it can never leak into a later tag.
    boolean hasHash = hashtag;
    hashtag = false;

    if (lp.equals(""))                      // not a hyperlink tag
        return;

    if (lp.indexOf("http://") != -1 || lp.indexOf("https://") != -1) {
        EL[++iEL] = '\t' + lp;              // external links
    } else if (lp.indexOf("MALFORMED") != -1) {
        ML[++iML] = '\t' + lp;              // malformed links
    } else if (!validLink(fp, lp) && !lp.startsWith("#")) {
        BL[++iBL] = '\t' + lp;              // invalid links
    } else {
        GL[++iGL] = '\t' + lp;              // good links
        if (hasHash)                        // link addresses an internal anchor
            FL[++iFL] = '\t' + lp;
    }
}
/* EXAMINE CURRENT HTML FILE FOR BAD LINKS.
Called from only one place in scan(). */
/**
 * Examines one HTML file for hyperlinks and image links.
 *
 * The file is read character by character. Text between '<' and '>' is
 * collected as a tag. Once a tag whose lower-cased text starts with "body"
 * has been seen, every later tag is passed to checkLnk() and checkImg().
 * After the whole file has been read, each per-file list that received at
 * least one entry is written (heading line included) to its report file.
 *
 * Failures are no longer swallowed silently: any exception is reported on
 * System.err together with the file name, and the reader is always closed.
 * As before, a failure means nothing is written for this file and the scan
 * carries on with the next file.
 *
 * Called from only one place in scan().
 *
 * @param fp filespec of the HTML file to examine
 */
static void checkLinks(String fp) {
    FileReader fr = null;                   // reader for the file being examined
    boolean inTag = false;                  // true while between '<' and '>'
    boolean inBody = false;                 // true once the body tag has been seen
    StringBuilder tagText = new StringBuilder();   // text of the tag being captured

    try {
        fr = new FileReader(fp);
        int x;
        while ((x = fr.read()) != -1) {     // loop ends at end-of-file
            char c = (char) x;
            if (c == '<') {
                inTag = true;
            } else if (c == '>') {
                String Tag = tagText.toString();
                // The lower-cased copy is used for the body test only; the
                // raw text is passed on so that file names keep their case.
                if (Tag.toLowerCase().indexOf("body") == 0) {
                    inBody = true;
                } else if (inBody) {
                    checkLnk(Tag, fp);
                    checkImg(Tag, fp);
                }
                tagText.setLength(0);       // ready for the next tag
                inTag = false;
            } else if (inTag) {
                tagText.append(c);
            }
        }

        /* Element 0 of each list holds the heading for this file, so a list
           is written only if its index shows that at least one link was
           added after the heading. */
        int i;
        if(iGL > 0) for(i = 0; i < iGL + 1; i++) gudLnks.write(GL[i] + "\n");
        if(iBL > 0) for(i = 0; i < iBL + 1; i++) badLnks.write(BL[i] + "\n");
        if(iEL > 0) for(i = 0; i < iEL + 1; i++) extLnks.write(EL[i] + "\n");
        if(iML > 0) for(i = 0; i < iML + 1; i++) malLnks.write(ML[i] + "\n");
        if(iFL > 0) for(i = 0; i < iFL + 1; i++) haxLnks.write(FL[i] + "\n");
        if(iGI > 0) for(i = 0; i < iGI + 1; i++) gudImgs.write(GI[i] + "\n");
        if(iBI > 0) for(i = 0; i < iBI + 1; i++) badImgs.write(BI[i] + "\n");
        if(iEI > 0) for(i = 0; i < iEI + 1; i++) extImgs.write(EI[i] + "\n");
        if(iMI > 0) for(i = 0; i < iMI + 1; i++) malImgs.write(MI[i] + "\n");
        if(iFI > 0) for(i = 0; i < iFI + 1; i++) haxImgs.write(FI[i] + "\n");
    } catch (Exception e) {
        // Best effort: report the problem and let the scan continue.
        System.err.println("checkLinks: could not process " + fp + ": " + e);
    } finally {
        if (fr != null) {
            try {
                fr.close();
            } catch (Exception ignored) {
                // nothing useful can be done if closing a reader fails
            }
        }
    }
}
/* This method is re-entrant. It calls itself. When invoked, it lists the
files and directories contained within the directory 'dir' passed to
it as its parameter. It then examines each entry in that directory. If
an entry is an HTML file, it writes that file's relative filespec to the
A[] array. The 'relative' filespec is the path+filename from the point of
view of the parent directory. If an entry is a directory, it simply calls
itself to deal with that (sub) directory as it is doing with the current
directory. Thus it can handle any depth of sub-directories.
Called only by itself and from one place in main(). */
/**
 * Recursively walks the directory 'd'.
 *
 * Sub-directories are entered by a recursive call, except those named
 * images, webtools, articles-pdf, applets and java_progs. For every regular
 * file whose path contains ".html" the per-file lists are reset, with the
 * file's path as the heading in element 0, and the file is handed to
 * checkLinks().
 *
 * File.list() returns null when the path is not a directory or cannot be
 * read. That case is now reported and skipped instead of causing a
 * NullPointerException.
 *
 * Called only by itself and from one place in main().
 *
 * @param d path of the directory to scan
 * @throws IOException declared for callers; see main()
 */
private static void scan(String d) throws IOException {
    String D[] = new File(d).list();        // names of all items in this directory
    if (D == null) {
        System.err.println("scan: cannot list directory " + d);
        return;
    }

    for (int i = 0; i < D.length; i++) {
        String dd = D[i];                   // bare name of the entry
        String fp = d + "/" + dd;           // path of the entry
        File fs = new File(fp);

        if (fs.isDirectory()                // a directory that is not excluded
                && !dd.equals("images")
                && !dd.equals("webtools")
                && !dd.equals("articles-pdf")
                && !dd.equals("applets")
                && !dd.equals("java_progs")) {
            dir_level++;                    // one level deeper for the recursive call
            scan(fp);
            dir_level--;                    // back to the level of this directory
        }
        else if (fs.isFile() && fp.indexOf(".html") != -1) {
            // Put the file's filespec at the head of every list.
            GL[0] = '\n' + fp;
            BL[0] = '\n' + fp;
            EL[0] = '\n' + fp;
            ML[0] = '\n' + fp;
            FL[0] = '\n' + fp;
            GI[0] = '\n' + fp;
            BI[0] = '\n' + fp;
            EI[0] = '\n' + fp;
            MI[0] = '\n' + fp;
            FI[0] = '\n' + fp;
            // Reset all indexes for both hyperlinks and images.
            iGL = 0; iBL = 0; iEL = 0; iML = 0; iFL = 0;
            iGI = 0; iBI = 0; iEI = 0; iMI = 0; iFI = 0;
            checkLinks(fp);                 // examine the links in this HTML file
        }
    }
}
/**
 * Program entry point.
 *
 * Uses the first command-line argument as the base directory of the website,
 * or "../.." if none is given. If that path is a directory, the ten report
 * files are opened in the current working directory, the directory tree is
 * scanned and the report files are closed. Otherwise a message is printed.
 *
 * The report files are now closed in a finally block, so whatever has been
 * written is flushed to disk even if the scan or one of the later
 * constructors throws.
 *
 * @param args optional: args[0] is the base directory
 * @throws IOException if a report file cannot be opened or closed, or if
 *                     scan() fails
 */
public static void main(String args[]) throws IOException {
    String bd = "../..";                    // default base directory
    if (args.length > 0)                    // a directory given on the command
        bd = args[0];                       // line replaces the default
    dl = bd.length() + 1;                   // length of parent directory path name + 1

    if (!new File(bd).isDirectory()) {
        System.out.println( bd + " is not a directory." );
        return;
    }

    try {
        gudLnks = new FileWriter("good_links.txt");
        badLnks = new FileWriter("bad_links.txt");
        extLnks = new FileWriter("external_links.txt");
        malLnks = new FileWriter("malformed_links.txt");
        haxLnks = new FileWriter("hash_links.txt");
        gudImgs = new FileWriter("good_imgs.txt");
        badImgs = new FileWriter("bad_imgs.txt");
        extImgs = new FileWriter("external_imgs.txt");
        malImgs = new FileWriter("malformed_imgs.txt");
        haxImgs = new FileWriter("hash_imgs.txt");
        scan(bd);                           // walk the site and fill the reports
    } finally {
        closeReports();
    }
}

/* Closes every report file that was opened. All of them are attempted even
   if one fails; the first failure, if any, is rethrown afterwards. */
private static void closeReports() throws IOException {
    java.io.Closeable[] reports = {
        gudLnks, badLnks, extLnks, malLnks, haxLnks,
        gudImgs, badImgs, extImgs, malImgs, haxImgs
    };
    IOException first = null;
    for (int i = 0; i < reports.length; i++) {
        if (reports[i] == null)             // never opened
            continue;
        try {
            reports[i].close();
        } catch (IOException e) {
            if (first == null)
                first = e;
        }
    }
    if (first != null)
        throw first;
}
}