/**
* Web Site Titles and Descriptions Length Checker for HTML files.
* @author Robert John Morton
* @version 22 May 2019 */
/* This program lists all HTML files with:
1) titles shorter than 60 characters in title_too_short.txt
2) titles longer than 70 characters in title_too_long.txt
3) descriptions shorter than 140 characters in descr_too_short.txt
4) descriptions longer than 160 characters in descr_too_long.txt */
import java.io.*;
/**
 * Scans an HTML articles index character by character and reports every
 * entry whose title or description length lies outside the accepted range.
 *
 * <p>Four report files are written to the current directory:
 * {@code title_too_short.txt}, {@code title_too_long.txt},
 * {@code descr_too_short.txt} and {@code descr_too_long.txt}. Each report
 * line has the form {@code path + " " + length + "\n"}.
 */
class titles_checker {

    /** Index file read by {@link #main}. */
    private static final String INDEX_FILE =
        "../../articles-index/articles_index.html";

    /** Titles shorter than this many characters are reported as too short. */
    private static final int TITLE_MIN = 60;
    /** Titles longer than this many characters are reported as too long. */
    private static final int TITLE_MAX = 70;
    /** Descriptions shorter than this many characters are reported as too short. */
    private static final int DESCR_MIN = 140;
    /** Descriptions longer than this many characters are reported as too long. */
    private static final int DESCR_MAX = 160;

    /*
     * Chop offsets applied to the text captured for a <dt> element. The
     * captured text starts with the '>' that closed the opening tag and ends
     * with "</dt" (the final '>' is never captured), so the 13 leading
     * characters are that '>' plus 12 characters of markup and the 8
     * trailing characters are 4 characters of markup plus "</dt".
     * NOTE(review): these look sized for an entry such as
     *   <dt>001 <a href="path">Title</a></dt>
     * -- confirm against the real index file.
     */
    private static final int DT_LEAD = 13;
    private static final int DT_TAIL = 8;
    /** A captured title shorter than or equal to this is ignored. */
    private static final int DT_MIN_CAPTURE = 22;

    /*
     * Chop offsets applied to the text captured for a <dd> element: the
     * leading '>' plus 19 characters of markup, and 6 characters of markup
     * plus the trailing "</dd".
     */
    private static final int DD_LEAD = 20;
    private static final int DD_TAIL = 10;
    /** A captured description shorter than or equal to this is ignored. */
    private static final int DD_MIN_CAPTURE = 34;

    /**
     * Extracts the link path and title from the text captured for one
     * {@code <dt>} element and reports the title if its length is out of
     * range.
     *
     * @param captured    raw text captured between {@code <dt>} and {@code </dt>}
     * @param currentPath path remembered from the previous successfully parsed entry
     * @param tooShort    receives entries whose title is shorter than {@value #TITLE_MIN}
     * @param tooLong     receives entries whose title is longer than {@value #TITLE_MAX}
     * @return the path of this entry, or {@code currentPath} unchanged when
     *         the captured text is too short or holds no {@code '>'} separator
     * @throws IOException if writing a report line fails
     */
    private static String reportTitle(String captured, String currentPath,
            Writer tooShort, Writer tooLong) throws IOException {
        String dt = captured.trim();
        int len = dt.length();
        if (len <= DT_MIN_CAPTURE) {
            return currentPath;
        }
        // What remains after chopping is: path '>' title
        String entry = dt.substring(DT_LEAD, len - DT_TAIL).trim();
        int sep = entry.indexOf('>');
        if (sep == -1) {
            return currentPath;
        }
        String path = entry.substring(0, sep).trim();
        String title = entry.substring(sep + 1).trim();
        int n = title.length();
        if (n < TITLE_MIN) {
            tooShort.write(path + " " + n + "\n");
        } else if (n > TITLE_MAX) {
            tooLong.write(path + " " + n + "\n");
        }
        return path;
    }

    /**
     * Extracts the description from the text captured for one {@code <dd>}
     * element and reports it if its length is out of range.
     *
     * @param captured raw text captured between {@code <dd>} and {@code </dd>}
     * @param path     path of the most recently parsed title entry; used to
     *                 label the report line
     * @param tooShort receives entries whose description is shorter than {@value #DESCR_MIN}
     * @param tooLong  receives entries whose description is longer than {@value #DESCR_MAX}
     * @throws IOException if writing a report line fails
     */
    private static void reportDescription(String captured, String path,
            Writer tooShort, Writer tooLong) throws IOException {
        String dd = captured.trim();
        int len = dd.length();
        if (len <= DD_MIN_CAPTURE) {
            return;
        }
        int n = dd.substring(DD_LEAD, len - DD_TAIL).trim().length();
        if (n < DESCR_MIN) {
            tooShort.write(path + " " + n + "\n");
        } else if (n > DESCR_MAX) {
            tooLong.write(path + " " + n + "\n");
        }
    }

    /**
     * Reads HTML from {@code in} until end of input and writes one report
     * line for every title or description whose length is out of range.
     *
     * <p>Only tags spelled exactly {@code dt}, {@code dd}, {@code /dt} and
     * {@code /dd} (no attributes, lower case) are recognised. A description
     * is labelled with the path of the most recently parsed title, so if a
     * title could not be parsed the previous entry's path is reused.
     *
     * @param in            source of the index HTML
     * @param titleTooLong  report of titles that are too long
     * @param titleTooShort report of titles that are too short
     * @param descrTooLong  report of descriptions that are too long
     * @param descrTooShort report of descriptions that are too short
     * @throws IOException if reading the input or writing a report fails
     */
    static void check(Reader in, Writer titleTooLong, Writer titleTooShort,
            Writer descrTooLong, Writer descrTooShort) throws IOException {
        StringBuilder tag = new StringBuilder();  // name text of the tag being read
        StringBuilder dt = new StringBuilder();   // text captured for the current <dt>
        StringBuilder dd = new StringBuilder();   // text captured for the current <dd>
        boolean inTag = false;  // true while between '<' and '>'
        boolean inDT = false;   // true while capturing title text
        boolean inDD = false;   // true while capturing description text
        String path = "";       // href path of the latest parsed title entry
        int c;

        // Reader.read() returns -1 at end of input; it does not throw.
        while ((c = in.read()) != -1) {
            if (c == '<') {
                inTag = true;
                tag.setLength(0);
            } else if (inTag) {
                if (c == '>') {
                    inTag = false;
                    /* A <dt> or <dd> tag switches capturing on; a </dt> or
                       </dd> tag parses what was captured and switches it
                       off again. */
                    String name = tag.toString();
                    if (name.equals("dt")) {
                        inDT = true;
                    } else if (name.equals("dd")) {
                        inDD = true;
                    } else if (name.equals("/dt")) {
                        path = reportTitle(dt.toString(), path,
                                titleTooShort, titleTooLong);
                        dt.setLength(0);
                        inDT = false;
                    } else if (name.equals("/dd")) {
                        reportDescription(dd.toString(), path,
                                descrTooShort, descrTooLong);
                        dd.setLength(0);
                        inDD = false;
                    }
                } else {
                    tag.append((char) c);
                }
            }
            /* Every character, markup included, is added to the active
               capture buffer; the title takes priority over the
               description when both flags are set. */
            if (inDT) {
                dt.append((char) c);
            } else if (inDD) {
                dd.append((char) c);
            }
        }
    }

    /**
     * Checks the articles index and writes the four report files.
     *
     * <p>I/O failures are no longer swallowed: they propagate to the caller
     * after all opened files have been closed.
     *
     * @param args ignored
     * @throws Exception if the index cannot be read or a report cannot be written
     */
    public static void main(String args[]) throws Exception {
        // NOTE(review): FileReader decodes with the platform default charset,
        // so the measured length of non-ASCII text depends on it -- confirm
        // the encoding of the index file.
        try (Reader aif = new BufferedReader(new FileReader(INDEX_FILE));
             Writer ttl = new FileWriter("title_too_long.txt");
             Writer tts = new FileWriter("title_too_short.txt");
             Writer dtl = new FileWriter("descr_too_long.txt");
             Writer dts = new FileWriter("descr_too_short.txt")) {
            check(aif, ttl, tts, dtl, dts);
        }
    }
}