/* PROGRAM SOURCE FILE: TFA.C PROGRAMMER: ROBERT J MORTON 28 JUNE - 10 JULY 1994 TFA.EXE - THE EBS TEXT FILE ANALYSER AND DICTIONARY GENERATOR COUNTS THE BYTES (CHARACTERS), WORDS, SENTENCES AND LINES IN ANY TEXT FILE NAMED ON THE COMMAND LINE. GENERATES A LIST OF ALL THE DIFFERENT WORDS WHICH OCCUR IN THE TEXT AND HOW OFTEN EACH WORD OCCURS. ANALYSES THE VOCABULARY INTO OCCURRENCE BANDS AND DISPLAYS THE RESULTS AND ALSO STORES THEM IN A FILE. CREATES DICTIONARY FILES SORTED INTO ALPHEBETIC ORDER AND INTO ORDER OF DECREASING WORD OCCURRENCE. */ #include // for timer services for end of run beep delay #include // required for the filelength() function #include // required for kbhit() #include // for printf(), fprintf(), fopen(), fclose. #include // for ANSI compatibility of malloc() #include // to set text positions on the screen #include /* to allocate memory for each word captured into the dictionary */ #define FALSE 0 #define TRUE !FALSE #define MAXDIC 32768 // maximum possible number of words in dictionary #define SL 17 // screen line for sort monitor long nc = 0, // number of characters in the file nw = 0, // number of words in the file nl = 0, // number of lines in the file ns = 0; // number of sentences in the file int nW = 0, // current number of words in the dictionary DoDic = FALSE; /* TRUE = we create a dictionary and enough memory is still free for it */ char w[32]; // array to hold word currently being captured for dictionary char *bl = " " " "; // 79-character string FILE *StatsFile; // handle for file in which screen display is stored struct DicEnt { // define a structure for a dictionary entry char _far *p; // pointer to the word (in dynamically-allocated memory) int o; // number of occurrences of word within subject text file char s; // size of the word ie. number of characters it comprises char c; // capitalization status: 0 = word, 1 = Word, 2 = WORD } // declare a far array of far pointers to dictionary entry structures _far * _huge Dic[MAXDIC]; /* ENTERS A NEW WORD INTO THE DICTIONARY IF IT IS NOT ALREADY THERE Takes a pointer to the latest word encountered in the text file, the size of the word (ie the number of characters in it) and the word's capitalization status: 0 = word, 1 = Word, 2 = WORD */ int PutDic(char *pt, int st, int cap) { /* integer variables to contain the indexes for `word-within-dictionary' and `character-within-word' and a boolean that is true if the test word matches a word already in the dictionary. */ int i, j, match; char _far *pd; // points to the new dictionary entry // points to the newly allocated dictionary entry structure struct DicEnt _far *ps; /* if we have now filled the whole of the dictionary pointer array abort the dictionary generation process */ if(nW == MAXDIC){ printf("\nDictionary array too small.\n"); return(FALSE); } /* For each word currently in the dictionary, set pd to point to the first character of this dictionary entry and pre-set the match flag to TRUE then test to see if test word and dictionary entry word are the same length. */ for(i = 0; i < nW; i++) { pd = (ps = *(Dic + i))->p; match = TRUE; // if we have a miss-match, clear the match flag and bail out of the loop if(st == ps->s) for(j = 0; j < st; j++) // for each letter of the word: if(*(pd + j) != *(pt + j)) { match = FALSE; break; } // else set flag to indicate a miss-match if words are different lengths else match = FALSE; /* if the test word was already in the dictionary, increment the number of occurrences of this word. Supplant new lower-case version of word for an upper-case version. Exit because the word was found to be already in the dictionary. */ if(match) ++(ps->o); if(cap == 0 && ps->c > 0) ps->c = 0; return(TRUE); } // end of for() loop /* If we are successful in allocating memory for the new word and also for its data structure, */ if((pd = (char _far *)_fmalloc((size_t)st * sizeof(char))) && (ps = (struct DicEnt _far *)_fmalloc(sizeof(struct DicEnt)))) { // put the new word into its newly-allocated string space for(j = 0; j < st; j++) *(pd + j) = *(pt + j); ps->p = pd; // address of new string to dictionary pointer array ps->s = st; // store the length of the new dictionary entry ps->c = cap; // store capitalization status of new dictionary entry ps->o = 1; // record the first occurrence of this word // set pointer to new structure in access array and increment total words *(Dic + nW++) = ps; _settextposition(3,61); // Set co-ordinates to display the print printf("% 6d",nW); // display number of words captured so far return(TRUE); // return with new word successfully entered } // abort if there is no memory left for more dictionary entries printf("\nInsufficient memory for dictionary.\n"); return(FALSE); } // COUNTS CHARS, WORDS, SENTENCES, LINES IN TEXT FILE void Counter(FILE *fh) { int c, ctr = 0, // ASCII value of current character cc = 0, // char count within the word currently being captured cap = 0, // 0 = all lower-case, 1 = initial cap, >1 = all-caps iw = FALSE, // true when we are currently inside a word es = FALSE, // true when an end-of-sentence has been encountered uc = FALSE; // true when an upper-case letter has been encountered long fl = filelength(fileno(fh)); // length of subject text file // while we have not yet reached the end of the file while((c = getc(fh)) != EOF) { ++nc; // increment the character count if(ctr) // if not yet at zero ctr--; // decrement inter-update counter else { ctr = 1024; _settextposition(2,9); // re-set inter-update counter printf("% 3ld", ((100 * nc) / fl)); // update %age done display } if(c == '\n') // if 'new-line' character, ++nl; // increment line count // if we are in whitespace (ie outside a word) if(c < 33 || c == '-' || c == '/') { iw = FALSE; // reset the inside-a-word flag /* If prev char was a full stop and char before that was not upper case increment the sentence counter and reset the end-of-sentence flag */ if(es && !uc) { ++ns; es = FALSE; } /* If a word has been captured and the dictionary is not out of memory, register the word in the dictionary. */ if(cc && DoDic) DoDic = PutDic(w,cc,cap); cc = 0; // clear character count of captured word cap = 0; // clear capitalisation status } else { // else we are currently inside a word /* if this is the word's first character, set the in-word flag and increment the number of words counter. */ if(!iw) { iw = TRUE; ++nw; } /* If this character marks the end of a sentence, set the end-of-sentence flag. Else it'n not the end of a sentence, so clear the 'end-of-sentence' flag. */ if(c == '.' || c == '?' || c == '!') es = TRUE; else { es = FALSE; /* if not upper case, clear the upper-case flag, else it must be a capital letter, so set the upper-case flag, set the word's capitalization status: 0 = word, 1 = Word, 2 = WORD and convert the letter to lower case. */ if(c < 'A' || c > 'Z') uc = FALSE; else { uc = TRUE; cap = cc + 1; c += 32; } /* if not end of word and the character is a letter, add new char to the word currently being captured for the dictionary. */ if(uc || (c >= 'a' && c <= 'z')) w[cc++] = c; } } } } /* COMPARES THE WEIGHTS OF TWO DICTIONARY ENTRIES px AND py TAKES: pointer to bubble's or stone's data structure pointer to the dictionary entry's data structure swapping criterion flag -1 = swap if <, +1 = swap if > sort switch: 0 = alphabetic sort, 1 = reverse numeric sort. */ int SortSwap(struct DicEnt _far *px, struct DicEnt _far *pi, int f, int sw) { int j, J, // Loop index variables for character scanning loop c, d, // ASCII value variables for character scanning loop sx, si, // lengths of the bubble/stone & dictionary entry words ox, oi, // occurences of bubble/stone & dictionary entry words flag = 0; /* +1 = x-word is greater, 0 = they're identical, -1 = i-word is greater. */ char _far *wx = px->p, // points to the first character of the x-word _far *wi = pi->p; // points to the first character of the i-word /* If set to reverse numeric sort if number of occurrences of the bubble/ stone word > those of i-word flag = -1 because we are sorting into order of DECREASING occurrence and vice versa. */ if(sw) { if((ox = px->o) > (oi = pi->o)) flag = -1; else if(ox < oi) flag = +1; goto A; } // put the length of the shorter entry in J if((sx = px->s) > (si = pi->s)) J = si; else J = sx; /* for each character of the shorter of the two words: if any of string x's corresponding characters is greater, string 'x' is greater; else if any of string y's corresponding characters is greater, string 'y' is greater. */ for(j = 0; j < J; j++) if((c = *(wx + j)) > (d = *(wi + j))) { flag = +1; goto A; } else if(c < d) { flag = -1; goto A; } // if all common characters match, the longer string is the greater if(sx < si) flag = -1; else if(sx > si) flag = +1; /* If swapping criterion is not satisfied, return FALSE. Return TRUE if swapping occurred. */ A:if(flag != f) return(FALSE); return(TRUE); } void Sort(int sw) { // THE EBS BUBBLESTONE SORT int b, s; // element numbers of bubble and stone in dictionary array // while the stone is below the bubble for(b = 0, s = nW -1; b < s; b++, s--) { int i; // inner loop index struct DicEnt _far *pI, // pointer to current dictionary entry's data _far *pB, // pointer to current bubble's data structure _far *pS; // pointer to current stone's data structure // display current number of words left to sort _settextposition(SL + sw, 25); printf("% 6d", nW - s + b); // if bubble is heavier than stone, swap them over. if(SortSwap(pB = *(Dic + b), pS = *(Dic + s), 1, sw)) { pI = pB; pB = pS; pS = pI; } /* for each dictionary entry between upper and lower bounds inclusive, if bubble is heavier than current entry, swap them over; else if the stone is lighter than current entry, swap them over. */ for(i = b + 1; i < s; i++) if(SortSwap(pB, pI = *(Dic + i), +1, sw)) { *(Dic + i) = pB; pB = pI; } else if(SortSwap(pS, pI, -1, sw)) { *(Dic + i) = pS; pS = pI; } *(Dic + b) = pB; // save pointer to lightest entry in the pointer array *(Dic + s) = pS; // save pointer to heaviest entry in the pointer array } } void BeepDelay(void) { // BEEP EVERY 10 SECONDS WHEN PROGRAM HAS FINISHED int flag = TRUE; printf("\nFinished. Press any key to return to DOS prompt."); while(flag) { // set target time to current time + delay duration and sound a beep. time_t t = time(NULL) + 10; printf("\07"); /* While current time has not yet reached the target time, wait; but if a key is hit, set the exit flag. */ while(time(NULL) < t && flag) if(kbhit()) flag = FALSE; } } /* COPIES DICTIONARY FROM THE ARRAY TO THE APPROPRIATE DICTIONARY FILE receives pointer to name of output file. */ int DicOut(char *OutFile) { FILE *fh; // file handle for the output file int i, c; if(fh = fopen(OutFile, "w")) { // if able to open specified output file, for(i = 0; i < nW; i++) { // for each word in the dictionary // pointer to the i th dictionary entry structure struct DicEnt _far *pd = *(Dic + i); int j, J = pd->s, // number of characters in the current dictionary word cap = pd->c; // capitalization status of this word char _far *s = pd->p; // points to start of current dictionary entry /* If no capitalization is required, transfer the dictionary entry into the word buffer array.*/ if(cap == 0) for(j = 0; j < J; j++) *(w + j) = *(s + j); /* Else, if an initial capital is required, capitalize it then transfer the initial letter and transfer the rest of the letters normally.*/ else if(cap == 1) { *w = *s - 32; for(j = 1; j < J; j++) *(w + j) = *(s + j); } /* Else (the whole word is capitalised, so transfer the dictionary entry into the word buffer, capitalizing all letters. */ else for(j = 0; j < J; j++) *(w + j) = *(s + j) - 32; /* Add a terminating null character and write the number of occurrences + the word concerned to output file. */ *(w + J) = '\0'; fprintf(fh, "% 4d %s\n", pd->o, w); } fclose(fh); return(TRUE); // close file and return 'successful' } else return(FALSE); // return FALSE if file could not be opened } /* SUBSTITUTE OR ADD A SPECIFIED EXTENSION TO A FILENAME receives original filename and new extension. */ char *NewExt(char *fn, char *ex) { int i, j, x; char s[128], *p; // find the dot in the filename, or the end of the name if no extension for(i = 0; (x = *(fn + i)) != '\0' && x != '.'; i++) *(s + i) = x; p = s + i; // set pointer p to where the dot should be // put the .STA ending on the file name for(i = 0; i < 5; i++) *(p + i) = *(ex + i); return(s); // return reference to array containing the file name } // SORT & SAVE DICTIONARY IN BOTH ALPHABETIC AND OCCURRENCE ORDERS void SaveDics(char *fn) { static char *c = " dictionary is in file ", *d = "Could not open/write ", *e = " Sort: sorted 00000 of ", *f = "Alphabetic", *g = "Occurrence", *h = " words.", *s; // points to array to hold the output file names // display the alphabetic sort monitor _settextposition(SL,1); printf("%s%s%d%s",f,e,nW,h); Sort(0); // sort the dictionary into alphabetic order of words // reset to overwrite sort monitor message with result _settextposition(SL, 1); /* if the alphabetic dictionary was written to the file ok, then print the confirmation message and, if a stats file exists, print a confirmation message to stats file. */ if(DicOut(s = NewExt(fn, ".DIC"))) { printf("%s%s%s \n",f,c,s); if(StatsFile) fprintf(StatsFile, "%s%s%s\n",f,c,s); } /* Else, print the error message and also print it to the stats file if it exists. */ else { printf("%s%s\n",d,s); if(StatsFile) fprintf(StatsFile,"%s%s\n",d,s); } // display the reverse-numeric sort monitor _settextposition(SL + 1, 1); printf("%s%s%d%s", g, e, nW, h); Sort(1); // sort dictionary into order of decreasing word occurrence // reset to overwrite sort monitor message with result _settextposition(SL + 1, 1); /* If the occurrence dictionary was written to file ok, print the confirmation message; print it also to the Stats File if it exists. */ if(DicOut(s = NewExt(fn, ".OCC"))) { printf("%s%s%s \n",g,c,s); if(StatsFile) fprintf(StatsFile, "%s%s%s\n",g,c,s); } /* Else, print the error message. And to the Stats File if it exists. */ else { printf("%s%s\n",d,s); if(StatsFile) fprintf(StatsFile, "%s%s\n",d,s); } } // DISPLAY THE RESULTS DATA ON THE SCREEN char *ShowResults(char *fn) { char *s; // pointer to statistics file name printf("The file %s contains:\n", fn); printf(" %ld characters, %ld words, %ld" " sentences, %ld lines.\n",nc,nw,ns,nl); if (ns) printf("Average number of words" " per sentence = %ld.\n",nw / ns); if(DoDic) printf("Total vocabulary: %d " "different words.\n",nW); printf("\n"); // if the statistics file can be opened OK if(StatsFile = fopen(s = NewExt(fn, ".STA"),"w")) { fprintf(StatsFile, "TFA.EXE - THE EBS TEXT FILE ANALYSER " "AND DICTIONARY GENERATOR\n\n"); fprintf(StatsFile, "The file %s contains:\n",fn); fprintf(StatsFile, " %ld characters, %ld words, %ld " "sentences, %ld lines.\n",nc,nw,ns,nl); if (ns) fprintf(StatsFile, "Average number of words per" " sentence = %ld.\n",nw / ns); if(DoDic) fprintf(StatsFile, "Total vocabulary:" " %d different words.\n",nW); fprintf(StatsFile,"\n"); return(s); } return(NULL); // could not open the stats file } void Help(void) { printf("\n"); printf("This program lists the number of bytes" " (characters), words, sentences,\n"); printf("lines and the average number of words" " per sentence in the text file\n"); printf("named as its argument. It also optionally" " generates a dictionary of\n"); printf("the words appearing in that file (up" " to a maximum of 32768 words).\n"); printf("\n"); printf("Command line format is: C:>WC FILENAME.TXT /V\n"); printf(" | | | |\n"); printf("DOS prompt \n------------ | | |"); printf("Name of this program ------\n | |"); printf("Name of text file to be analysed -- |\n"); printf("Optional vocabulary generator switch ------\n"); printf("\n"); printf("If in place of /V, you enter /D, two" " dictionary files are produced.\n"); printf("Each lists every word appearing in " "FILENAME.TXT plus the number of times\n"); printf("it occurred. The first, FILENAME.DIC, " "lists the words in alphabetic order.\n"); printf("The second, FILENAME.OCC, lists them " "in order of decreasing occurrence.\n"); printf("FILENAME.STA is a copy of the statistics" " data which appears on the screen.\n"); printf("(c) 1994 R J Morton, 13 Thornbera Close," " Bishops Stortford, Herts. CM23 3NR\n");; } // SHOW THE DISTRIBUTION OF WORD OCCURRENCES IN THE SUBJECT TEXT FILE void ShowStats(void) { // occurrence cut-off limits and their corrosponding occurrence counters unsigned int a[] = {2,6,21,101,501,1001}, b[] = {0,0,0,0,0,0,0}, i, j, k; float x = (float)100 / nW; char *s[] = {" "," "," "," "," "," "," "}; // prefix char for percentage for(i = 0; i < nW; i++) { // for each word in the dictionary // get the number of times it occurred in the subject text file int o = (*(Dic + i))->o, f = FALSE; /* Then, for each range-limit, if the number of occurrences was below the limit, increment the appropriate range counter, set the range- found flag and break out of j-loop. */ for(j = 0; j < 6; j++) if(o < *(a + j)) { (*(b + j))++; f = TRUE; break; } if(!f) (*(b + 6))++; // increment the 'over 1000' counter } /* For each banded value, get the number and percentage of occurrences in band i. */ for(i = 0; i < 7; i++) { j = *(b + i); k = x * j; /* if integral percentage is zero but some words did fall into this band, put a < sign in front and increment the percentage. */ if(k == 0 && j > 0) { *(*(s + i)) = '<'; k++; } *(b + i) = k; // put the percentage in the banding array } printf("%s% 2u%% of words appeared only once\n",*(s + 0),*(b + 0)); printf("%s% 2u%% of words appeared between 2 and 5 times\n", *(s + 1), *(b + 1)); printf("%s% 2u%% of words appeared between 6 and 20 times\n", *(s + 2), *(b + 2)); printf("%s% 2u%% of words appeared between 21 and 100 times\n", *(s + 3), *(b + 3)); printf("%s% 2u%% of words appeared between 101 and 500 times\n", *(s + 4), *(b + 4)); printf("%s% 2u%% of words appeared between 501 and 1000 times\n", *(s + 5), *(b + 5)); printf("%s% 2u%% of words appeared over 1000 times\n\n", *(s + 6), *(b + 6)); if(StatsFile) { // if the stats file is open fprintf(StatsFile,"%s% 2u%% of words appeared only once\n", *(s + 0), *(b + 0)); fprintf(StatsFile,"%s% 2u%% of words appeared between" " 2 and 5 times\n", *(s + 1), *(b + 1)); fprintf(StatsFile,"%s% 2u%% of words appeared between" " 6 and 20 times\n", *(s + 2), *(b + 2)); fprintf(StatsFile,"%s% 2u%% of words appeared between" " 21 and 100 times\n", *(s + 3), *(b + 3)); fprintf(StatsFile,"%s% 2u%% of words appeared between" " 101 and 500 times\n", *(s + 4), *(b + 4)); fprintf(StatsFile,"%s% 2u%% of words appeared between" " 501 and 1000 times\n", *(s + 5), *(b + 5)); fprintf(StatsFile,"%s% 2u%% of words appeared over 1000" " times\n\n", *(s + 6), *(b + 6)); } } main(int argc, char *argv[]) { int c, i; // ASCII value of filename char, index for letter-case converter FILE *fh; // handle for the text-file being processed char *fn = *++argv, // pointer to name of the text file being processed *t = *++argv, // pointer to possible /D switch on command line *sf; // points to name of file where screen display stored // convert filename to upper case for display for(i = 0; (c = *(fn + i)) != '\0'; i++) if(c >= 'a' && c <= 'z') *(fn + i) -= 32; _clearscreen(_GCLEARSCREEN); printf("TFA.EXE - THE EBS TEXT FILE ANALYSER AND DICTIONARY GENERATOR\n"); if(*fn) { // if first command line argument exists, assume it's a filename /* Try to find and open the file to be analysed. If successful, set the 'save to dictionary' flag to false, and display a progress message on Lane 2. */ if (fh = fopen(fn, "r")) { int SaveDic = FALSE; printf("Analysed xx%% of file: %s\n",fn); /* If a command line switch is present, set flag to cause dictionary to be generated. If it is a full dictionary command, set the sort -and-save flag. */ if(t && *t == '/') { DoDic = TRUE; _settextposition(3,1); printf("Compiling dictionary: "); printf("number of different words found so far 00000"); if((c = *++t) == 'D' || c == 'd') SaveDic = TRUE; } /* Count chars, words, sentences, lines. Capture words into dictionary then close the file that was being processed. */ Counter(fh); fclose(fh); /* wipe the file processing progress message messages and show num- bers of chars, words, sentences, lines + words per sentence. */ _settextposition(2,1); printf("%s\n%s", bl, bl); _settextposition(3,1); sf = ShowResults(fn); if(DoDic) ShowStats(); if(sf) // provided stats file name exists, store stats in it printf("A copy of this screen display has been stored in %s\n",sf); if(SaveDic) SaveDics(fn); // create the sorted dictionary files } else printf("File %s not found.\n", fn); // beep every 10 seconds to tell the user we've finished if(DoDic) BeepDelay(); } else Help(); // else display the help text }