/**
  * List the name of each HTML file that contains one or more HTML comments in
    its body text. List any contained comments under the file path+name. 
  * @author Robert John Morton YE572246C
  * @version 07 August 2019 */

/* COMMANDS:
      cd /home/rob/Private/website/webtools/comments-extractor/
      java commentsExtractor  */

import java.io.*;  // for file input/output handling

/* Walks a website's directory tree, pulls every HTML comment out of the
<body> section of each qualifying HTML file and files it, by type, into one
of seven plain-text reports written to the current working directory. In
each report the comments are listed under the path+name of the HTML file
they came from, that path being written once per HTML file.

Command line:  java commentsExtractor [base-directory [sub-directory]] */

class commentsExtractor {

  /* Report numbers. Each one indexes REPORT_FILES, reports[] and the two
  per-report flag arrays. Numbers 0 to 5 also index MARKERS. */

  private static final int
    TO_DO = 0,         // 'To Do List' comments
    SUPP_INFO = 1,     // 'supplementary information' comments
    PERSONAL = 2,      // 'personal:' comments
    NAMES = 3,         // comments that contain people's names
    DISCARDED = 4,     // 'Old discarded text' comments
    DOC_HIST = 5,      // 'Document History' comments
    OTHER = 6,         // all other comments
    REPORT_COUNT = 7;  // total number of report files

  // Name of each report file, indexed by report number.
  private static final String[] REPORT_FILES = {
    "toDoNotes.txt", "supportInfo.txt", "personal.txt", "containsNames.txt",
    "oldDiscardedText.txt", "documentHistory.txt", "otherComments.txt"
  };

  /* Text that identifies each type of comment, indexed by report number.
  The markers are tried in this order and the first one found wins. A
  comment containing none of them goes to the OTHER report. */

  private static final String[] MARKERS = {
    "ToDoNote:", "SuppInfo:", "Personal:", "contains names",
    "Old discarded text:", "DocHist:"
  };

  /* The names of all the top-level directories in the website that
  must be scanned. Any other top-level directory is ignored. */

  private static final String[] FOLDERS = {
    "book","chaos","computers","home","internet","landshare",
    "navigation","poems","radio","science","software"
  };

  /* A directory, at any level, whose name contains
  one of these strings is never entered. */

  private static final String[] SKIPPED = {
    "images", "applets", "mktrman", "java_progs"
  };

  // Standard comments to be excluded from consideration.
  private static final String[] STANDARD_COMMENTS = {
    "Start of the left-hand navigation frame",
    "End of left nav frame. Start of top-right title frame",
    "End of top-right title frame. Start of bottom-right text frame",
    "End of the bottom-right scrolling text frame",
    "Start of text frame", "Start of right-hand text frame",
    "Set the height of the Title Frame (top right of browser window)."
  };

  // Separator written in front of each file path+name heading.
  private static final String SEPARATOR = "\n\n"
    + "----------------------------------------------------------------"
    + "\n";

  // The open report writers, indexed by report number.
  private static final Writer[] reports = new Writer[REPORT_COUNT];

  /* Per-report state for the HTML file currently being examined. Both
  arrays are renewed at the start of every HTML file. */

  private static boolean[]
    headerWritten = new boolean[REPORT_COUNT],  // path+name already written
    lastWasLong = new boolean[REPORT_COUNT];    // previous comment was long

  private static String
    baseDir = "/home/rob/Private/website";  // default base directory

  private static int nf = 0;  // total number of files processed




  // Return true if the comment contains one of the standard comment texts.

  private static boolean isStandardComment(String comment) {
    for (String standard : STANDARD_COMMENTS) {
      if (comment.indexOf(standard) != -1) {
        return true;
      }
    }
    return false;
  }




  // Return the number of the report to which the comment belongs.

  private static int classify(String comment) {
    for (int i = 0; i < MARKERS.length; i++) {
      if (comment.indexOf(MARKERS[i]) != -1) {
        return i;
      }
    }
    return OTHER;
  }




  /* Determine what type of comment has been captured and write it to the
  appropriate report file. 'path' is the file path of the HTML file that
  the comment came from; 'comment' is the complete comment, including its
  opening and closing delimiters. A failure to write is reported on the
  standard error stream and the scan carries on. */

  private static void commentType(String path, String comment) {
    if (isStandardComment(comment)) {
      return;  // standard comments are not reported
    }

    int fn = classify(comment);
    Writer out = reports[fn];

    try {

      /* If this is the first comment from this HTML file to go into this
      report, head it with the file's path+name relative to the base
      directory. The names report gets the bare path; all the other
      reports get a separator line in front of it. */

      if (!headerWritten[fn]) {
        String heading = path.substring(baseDir.length() + 1) + "\n";
        out.write(fn == NAMES ? heading : SEPARATOR + heading);
        headerWritten[fn] = true;
      }

      /* The names report lists only the files concerned,
      never the text of the comments themselves. */

      if (fn == NAMES) {
        return;
      }

      /* A comment is long if it spans more than one line or exceeds 80
      characters. A long comment is followed by an empty line and, unless
      the previous comment in this report was also long (and so already
      left one), preceded by an empty line too. A short comment is written
      on its own with no spacing. */

      boolean isLong = comment.indexOf('\n') != -1 || comment.length() > 80;
      if (isLong) {
        if (!lastWasLong[fn]) {
          out.write("\n");
        }
        out.write(comment + "\n\n");
      } else {
        out.write(comment + "\n");
      }
      lastWasLong[fn] = isLong;

    } catch (IOException e) {
      System.err.println(
        "Cannot write to " + REPORT_FILES[fn] + ": " + e.getMessage()
      );
    }
  }




  /* Read the HTML file 'path' one character at a time and pass every comment
  found between its <body> and </body> tags to commentType(). Reading stops
  at the </body> tag. Comments outside the body section are ignored, as is
  a comment left unterminated at the end of the file. A file that cannot be
  read is reported on the standard error stream and abandoned.

  NOTE: only the exact lower-case tags <body> and </body> are recognised. A
  body tag that carries attributes or is written in upper case is not
  detected, in which case no comments are taken from that file. */

  private static void examineThisHTMLfile(String path) {

    // A new HTML file: no headings written for it yet in any report.
    headerWritten = new boolean[REPORT_COUNT];
    lastWasLong = new boolean[REPORT_COUNT];

    boolean           // THE FOLLOWING FLAGS ARE TRUE WHEN:
      inCom = false,  // inputting the content of a comment tag
      inBdy = false;  // inputting from the HTML file's body text

    int x;  // for receiving java char input from file stream

    /* Moving character train holding the seven most recent characters,
    h being the newest. It aligns with the tags being sought like this:

       a  b  c  d  f  g  h
          <  b  o  d  y  >
       <  /  b  o  d  y  >
                <  !  -  -
                   -  -  >    */

    char a = 0, b = 0, c = 0, d = 0, f = 0, g = 0, h = 0;

    StringBuilder text = new StringBuilder();  // comment accumulator

    try (Reader in = new BufferedReader(new FileReader(path))) {
      while ((x = in.read()) != -1) {  // loop broken by End-Of-File

        // shift the characters along the train to make way for the new one.
        a = b; b = c; c = d; d = f; f = g; g = h; h = (char) x;

        if (inCom) {

          /* Inside a comment: add the new character to it. Then if we
          now have the closing delimiter, file the comment and leave
          comment mode with an empty accumulator. */

          text.append(h);
          if (f == '-' && g == '-' && h == '>') {
            commentType(path, text.toString());
            text.setLength(0);
            inCom = false;
          }

        } else if (inBdy) {

          /* Inside the body but not inside a comment: an opening comment
          delimiter primes the accumulator and starts comment mode; the
          </body> tag means we have finished with this file. */

          if (d == '<' && f == '!' && g == '-' && h == '-') {
            text.setLength(0);
            text.append("<!--");
            inCom = true;
          } else if (a == '<' && b == '/' && c == 'b' && d == 'o'
                  && f == 'd' && g == 'y' && h == '>') {
            break;
          }

        } else if (b == '<' && c == 'b' && d == 'o'
                && f == 'd' && g == 'y' && h == '>') {

          // Not yet in the body: the <body> tag takes us into it.
          inBdy = true;
        }
      }
    } catch (IOException e) {
      System.err.println("Cannot read " + path + ": " + e.getMessage());
    }
  }




  // Return true if 'name' is one of the top-level directories to be scanned.

  private static boolean isIndexedFolder(String name) {
    for (String folder : FOLDERS) {
      if (name.equals(folder)) {
        return true;
      }
    }
    return false;
  }




  // Return true if 'name' contains the name of a directory never entered.

  private static boolean isSkippedFolder(String name) {
    for (String skipped : SKIPPED) {
      if (name.indexOf(skipped) != -1) {
        return true;
      }
    }
    return false;
  }




  /* This method is re-entrant. It calls itself. It lists the entries of the
  directory 'dir' and deals with each in turn. 'depth' is 0 for the
  directory at which the scan started and one greater for each level of
  sub-directory below it.

  If an entry is a directory, this method calls itself to deal with it,
  provided that its name does not contain one of the SKIPPED names and, at
  depth 0 only, that it is one of the FOLDERS. Thus any depth of sub-
  directories can be handled.

  If an entry is a file below depth 0 whose name ends in ".html", but not in
  "_br.html" (whose comments duplicate those of another file) nor in
  "index.html", its comments are extracted and it is counted in 'nf'.

  A directory whose contents cannot be listed is reported on the standard
  error stream and skipped. */

  private static void scan(String dir, int depth) {

    // list() returns null if 'dir' cannot be read as a directory
    String[] entries = new File(dir).list();
    if (entries == null) {
      System.err.println("Cannot list directory " + dir);
      return;
    }

    for (String name : entries) {
      String path = dir + "/" + name;  // file path of this entry
      File entry = new File(path);

      if (entry.isDirectory()) {
        boolean wanted = depth > 0 || isIndexedFolder(name);
        if (wanted && !isSkippedFolder(name)) {
          scan(path, depth + 1);
        }
      } else if (depth > 0
              && entry.isFile()
              && path.endsWith(".html")
              && !path.endsWith("_br.html")
              && !path.endsWith("index.html")) {
        examineThisHTMLfile(path);
        nf++;  // increment number of files processed
      }
    }
  }




  /* Close every report writer that is open. All of them are attempted
  even if one fails; the first failure, if any, is then thrown. */

  private static void closeReports() throws IOException {
    IOException failure = null;
    for (int i = 0; i < REPORT_COUNT; i++) {
      if (reports[i] == null) {
        continue;
      }
      try {
        reports[i].close();
      } catch (IOException e) {
        if (failure == null) {
          failure = e;
        }
      }
      reports[i] = null;
    }
    if (failure != null) {
      throw failure;
    }
  }




  /* Create the seven report files in the current working directory, scan
  the website and finish by writing the number of HTML files processed to
  the end of the 'other comments' report.

  args[0], if present, is the website's base directory, replacing the
  built-in default. args[1], if present and not empty, is a sub-directory
  of the base directory at which the scan should start instead.

  If the starting directory does not exist, a message saying so is printed
  and no report file is created or altered.

  NOTE(review): the scan always starts at depth 0, so when a sub-directory
  is given, the FOLDERS filter is applied to that sub-directory's entries
  and the HTML files directly inside it are not examined. Confirm that
  this is what is wanted. */

  public static void main(String args[]) throws IOException {

    String subDir = "";
    if (args.length > 0) {
      baseDir = args[0];
      if (args.length > 1) {
        subDir = args[1];
      }
    }

    // Strings must be tested by content: '!=' compares references only.
    String fullPath = subDir.isEmpty() ? baseDir : baseDir + "/" + subDir;

    if (!new File(fullPath).isDirectory()) {
      System.out.println(fullPath + " is not a directory.");
      return;
    }

    nf = 0;
    try {
      for (int i = 0; i < REPORT_COUNT; i++) {
        reports[i] = new FileWriter(REPORT_FILES[i]);
      }
      scan(fullPath, 0);
      reports[OTHER].write(
        "\n\nTotal number of files processed: " + nf + "\n\n"
      );
    } finally {
      closeReports();  // whatever happened, leave no report file open
    }
  }
}