
(* ---------------------------------------------------------------
Title         Q&D Word/Token count
Overview
Usage
Notes
Bugs
Wish List     sort ? bah, let an external program take care of that point...

--------------------------------------------------------------- *)

MODULE wCount;

IMPORT FIO;
IMPORT Str;
IMPORT Lib;
IMPORT Storage;
IMPORT IO;
IMPORT QD_Box; (* required for "candy" local submodule *)

FROM IO IMPORT WrStr,WrLn;
FROM Storage IMPORT Available,ALLOCATE,DEALLOCATE;

FROM QD_Box IMPORT str80, str2, cmdInit, cmdShow, cmdStop, delim,
Work, video, Ltrim, Rtrim, UpperCase, LowerCase, ReplaceChar,
ChkEscape, Waitkey, WaitkeyDelay, Flushkey, IsRedirected, chkJoker,
isOption, GetOptIndex, GetLongCard, GetLongInt, GetString, CharCount,
same, aR, aH, aS, aD, aA, everything, isDirectory, fixDirectory,
str128, str256, Animation, allfiles, Belongs, FixAE, CodePhonetic,
CodeSoundex, CodeSoundexOrg, isReadOnly, LtrimBlanks, RtrimBlanks,
getStrIndex, cmdSHOW,BiosWaitkey,BiosWaitkeyShifted,BiosFlushkey,
str1024, isoleItemS, dmpTTX, str2048, Elapsed, TerminalReadString,
getDosVersion, DosVersion, warning95, runningWindows,
aV, reallyeverything, chkClassicTextMode, setClassicTextMode,
AltAnimation, str16, getCurrentDirectory, setReadWrite,
getFileSize, verifyString, str4096, unfixDirectory,
animShow, animSHOW, animAdvance, animEnd, animClear,
animInit, animGetSdone, anim, cleantabs, UpperCaseAlt, LowerCaseAlt,
completedInit, completedShow, completedSHOW, completedEnd, completed,
removeDups, isValidHDunit, removePhantoms, removeFloppies,
getCDROMunits, getCDROMletters, removeCDROMs, getAllHDunits;

(* ------------------------------------------------------------ *)

CONST
    cr          = CHR(13);
    lf          = CHR(10);
    nl          = cr+lf;
    dquote      = '"';
    dot         = ".";
    star        = "*";
    stardotstar = star+dot+star;
    backslash   = "\";
    space       = " ";
    tab         = CHR(9);
    wi          = 7; (* should do for LONGCARD width *)
    widashes    = "=======";
CONST
    progEXEname   = "WCOUNT";
    progTitle     = "Q&D Word/n-gram Counter";
    progVersion   = "v1.0a";
    progCopyright = "by PhG";
    banner        = progTitle+" "+progVersion+" "+progCopyright;
CONST
    errNone          = 0;
    errHelp          = 1;
    errOption        = 2;
    errTooManyParms  = 3;
    errMissingSpec   = 4;
    errTooManyMatches= 5;
    errNoMatch       = 6;
    errUpperLen      = 7;
    errLowerLen      = 8;
    errExactLen      = 9;
    errConflict      = 10;
    errNonsense      = 11;
    errAllocate      = 12;
    errMode          = 13;
    errJoker         = 14;
    errProblem       = 15;
    errAborted       = 255;

PROCEDURE abort (e : CARDINAL; einfo:ARRAY OF CHAR);

    MODULE message;
    IMPORT Str;
    EXPORT msg3;

    PROCEDURE msg3 (VAR R:ARRAY OF CHAR;S1,S2,S3:ARRAY OF CHAR);
    BEGIN
        Str.Concat(R,S1,S2);Str.Append(R,S3);
    END msg3;

    END message;

CONST
(*
 0        1         2         3         4         5         6         7         8
 12345678901234567890123456789012345678901234567890123456789012345678901234567890
*)
    msgHelp =
banner+nl+
nl+
"Syntax : "+progEXEname+" <file(s)> [option]..."+nl+
nl+
"This program reads tokens from specified text file(s) then builds frequencies."+nl+
'The "divide and conquer" approach is highly recommanded !'+nl+
nl+
"-g    n-gram counter mode (default mode is word counter)"+nl+
"-l:#  n-gram lower length ([1..100], default is 1)"+nl+
"-u:#  n-gram upper length ([1..100], default is 2)"+nl+
"-e:#  n-gram exact length ([1..100], shortcut for -l:# -u:#)"+nl+
"-d    allow [A..Z0..9] (default is [A..Z])"+nl+
"-c    ignore case"+nl+
"-a    ignore accents"+nl+
'-f    do not expand "" to "ae" nor "" to "AE"'+nl+
"-b    use [$00..$20] as token separators (default is ^[A..Z])"+nl+
"-i    individual processing for each file (default is cumulative process)"+nl+
"-o    output raw data to video screen without storing to RAM (-q forced)"+nl+
"-m    make an analysis using sorted output file (only -t taken into account)"+nl+
"-x    do not show header"+nl+
"-l    do not show list"+nl+
"-t    do not show frequency table"+nl+
"-s    do not show summary"+nl+
"-q    no processing eyecandy"+nl+
nl+
"Note -o and -m options may be useful with very large files."+nl+
"If program was aborted using Escape, return code is 255."+nl;

VAR
    S : str256; (* oversized *)
BEGIN
    CASE e OF
    | errHelp :
        WrStr(msgHelp);
    | errOption :
        msg3(S,'Unknown "',einfo,'" option !');
    | errTooManyParms:
        msg3(S,'Useless "',einfo,'" parameter !');
    | errMissingSpec:
        S := "Missing <file(s)> specification !";
    | errTooManyMatches:
        msg3(S,'Too many files match "',einfo,'" specification !');
    | errNoMatch:
        msg3(S,'No file matches "',einfo,'" specification !');
    | errUpperLen:
        msg3(S,'Illegal "',einfo,'" n-gram upper length !');
    | errLowerLen:
        msg3(S,'Illegal "',einfo,'" n-gram lower length !');
    | errExactLen:
        msg3(S,'Illegal "',einfo,'" n-gram exact length !');
    | errConflict:
        S := "-e:# option is a nonsense with any of -u:# and -l:# options !";
    | errNonsense:
        S := "-b and -d options are mutually exclusive !";
    | errAllocate:
        (* not very informative ! *)
        msg3(S,'Storage.ALLOCATE() would not store "',einfo,'" token !');
    | errMode:
        S := "-g option required with -l:#, -u:# and -e:# options !";
    | errJoker:
        S := "-m option will process one file only !";
    | errProblem:
        msg3(S,'Unexpected "',einfo,'" line !');
    | errAborted:
        S := "Aborted by user !";
    ELSE
        S := "This is illogical, Captain ! ;-)";
    END;
    CASE e OF
    | errNone, errHelp :
        ;
    ELSE
        WrStr(progEXEname+" : ");WrStr(S);WrLn;
    END;
    Lib.SetReturnCode(SHORTCARD(e));
    HALT;
END abort;

(* ------------------------------------------------------------ *)

    MODULE candy;
    FROM QD_Box IMPORT animInit, anim, animAdvance, animEnd, animClear;
    EXPORT doEyecandy, CANDYinit,CANDYshow,CANDYdone,CANDYsave,CANDYrestore;

    CONST
        CANDYinit    = 0;
        CANDYshow    = 1;
        CANDYdone    = 2;
        CANDYsave    = 3;
        CANDYrestore = 4;

        steps     = 10;
    VAR
        range,current,savecurrent:LONGCARD;
        advanced,i:CARDINAL;

    PROCEDURE doEyecandy (bargraf:BOOLEAN;what:CARDINAL;v:LONGCARD);
    BEGIN
        IF bargraf THEN
            CASE what OF
            | CANDYinit: (* v is total count *)
                animInit(steps, "[", "]", CHR(46), "", "\/" );
                range:=v DIV steps; INC(range); (* avoid DIV 0 ! *)
                current:=1;
                advanced:=0;
            | CANDYshow: (* v is current i *)
                IF (v DIV range) # current THEN
                    anim(animAdvance);
                    current:=(v DIV range);
                    INC(advanced);
                END;
            | CANDYdone:
                anim(animEnd);anim(animClear);
            | CANDYsave:
                anim(animEnd);anim(animClear);
                savecurrent:=current;
            | CANDYrestore:
                animInit(steps, "[", "]", CHR(46), "", "\/" );
                current:=savecurrent;
                FOR i:=1 TO advanced DO
                    anim(animAdvance);
                END;
            END;
        END;
    END doEyecandy;

    END candy;

(* ------------------------------------------------------------ *)

PROCEDURE sound (freq,duration,pause:CARDINAL);
BEGIN
    Lib.Sound(freq);
    Lib.Delay(duration);
    Lib.NoSound();
    Lib.Delay(pause);
END sound;

PROCEDURE alarm (  );
BEGIN
    sound(55,55,100);
    sound(55,55,10);
END alarm;

PROCEDURE alert (  );
BEGIN
    sound(550,55,100);
    sound(550,55,10);
END alert;

PROCEDURE wrQuoted (S:ARRAY OF CHAR);
BEGIN
    WrLn;WrStr(dquote);WrStr(S);WrStr(dquote);
END wrQuoted;

(* ------------------------------------------------------------ *)

PROCEDURE pad (wi:INTEGER;padchar:CHAR; VAR R:ARRAY OF CHAR);
VAR
    i:CARDINAL;
BEGIN
    FOR i:=Str.Length(R)+1 TO ABS(wi) DO
        IF wi < 0 THEN
            Str.Append(R,padchar);
        ELSE
            Str.Prepend(R,padchar);
        END;
    END;
END pad;

PROCEDURE lcToStr (wi:INTEGER;padchar:CHAR; n:LONGCARD):str80;
VAR
    ok:BOOLEAN;
    R:str80;
BEGIN
    Str.CardToStr ( n, R, 10, ok);
    pad (wi,padchar,R);
    RETURN R;
END lcToStr;

(* ------------------------------------------------------------ *)

CONST
    rem = ";";
    hdr = rem+" ";
VAR
    heapsizebefore : CARDINAL;

PROCEDURE markmem (  );
BEGIN
    heapsizebefore :=Storage.HeapTotalAvail(Storage.MainHeap);
END markmem;

PROCEDURE showmem(S:ARRAY OF CHAR);
CONST
    msgBefore   = hdr+"Byte(s) free before : ";
    msgUsed     = hdr+"Byte(s) grabbed     : ";
    msgFreed    = hdr+"Byte(s) released    : ";
    msgNow      = hdr+"Byte(s) free now    : ";
    w           = 6;
VAR
    heapsize    : CARDINAL; (* in PARAGRAPHS and not in bytes ! help is wrong ! *)
    n           : LONGCARD;
BEGIN
    WrLn;
    heapsize :=Storage.HeapTotalAvail(Storage.MainHeap);

(*
    n :=16 * LONGCARD(heapsizebefore);
    WrStr(msgBefore); IO.WrLngCard(n,w);WrLn;

    IF heapsizebefore < heapsize THEN
        n := 16 * LONGCARD(heapsize-heapsizebefore);
        WrStr(msgFreed); IO.WrLngCard(n,w);WrLn;
    ELSE
        n := 16 * LONGCARD(heapsizebefore-heapsize);
        WrStr(msgUsed); IO.WrLngCard(n,w);WrLn;
    END;
*)

    n :=16 * LONGCARD(heapsize);
    WrStr(msgNow); IO.WrLngCard(n,w);WrStr(" (");WrStr(S);WrStr(")");WrLn;
END showmem;

(* ------------------------------------------------------------ *)

CONST
    ignore = " ";

    Talpha = "                                "+
             "                                "+
             " ABCDEFGHIJKLMNOPQRSTUVWXYZ     "+
             " abcdefghijklmnopqrstuvwxyz     "+
             "ueaaaaeeeiiiAAEooouuyOU     "+
             "aiounN                          "+
             "                                "+
             "                                ";

PROCEDURE map (c : CHAR ;table : ARRAY OF CHAR) : CHAR;
BEGIN
    RETURN table [ ORD (c) ];
END map;

PROCEDURE removeAccents (VAR S : ARRAY OF CHAR);
VAR
    len : CARDINAL;
    i   : CARDINAL;
    c   : CHAR;
    ch  : CHAR;
BEGIN
    len := Str.Length(S);
    IF len = 0 THEN RETURN; END;         (* this WAS a problem ! *)
    FOR i := 0 TO (len - 1) DO
        c  := S[i];
        ch := map(c,Talpha);
        IF ch # ignore THEN S[i] := ch; END;
    END;
END removeAccents;

PROCEDURE isAlpha(ch:CHAR):BOOLEAN;
BEGIN
    RETURN map(ch,Talpha) # ignore;
END isAlpha;

PROCEDURE isOK (ch:CHAR;allowdigits:BOOLEAN) : BOOLEAN;
CONST
    digits = "0123456789";
BEGIN
    IF isAlpha(ch) THEN RETURN TRUE; END;
    IF allowdigits THEN
        RETURN (Str.CharPos(digits,ch) # MAX(CARDINAL));
    ELSE
        RETURN FALSE;
    END;
END isOK;

PROCEDURE isBlank(ch:CHAR):BOOLEAN;
BEGIN
    (*
    CASE ch OF
    | space,tab,cr,lf:
        RETURN TRUE;
    ELSE
        RETURN FALSE;
    END;
    *)
    RETURN (ORD(ch) <= ORD(space));
END isBlank;

PROCEDURE dofixae (VAR S : ARRAY OF CHAR):CARDINAL;
CONST
    LCae = ""; (* 145 *)
    UCae = ""; (* 146 *)
VAR
    i,p : CARDINAL;
    ch:CHAR;
    replace:str2;
BEGIN
    FOR i:=1 TO 2 DO
        CASE i OF
        | 1: ch:=LCae;replace:="ae";
        | 2: ch:=UCae;replace:="AE";
        END;
        LOOP
            p:=Str.CharPos(S,ch);
            IF p=MAX(CARDINAL) THEN EXIT; END;
            Str.Subst(S,ch,replace);
        END;
    END;
    RETURN Str.Length(S);
END dofixae;

(* ------------------------------------------------------------ *)

TYPE
    pstr = POINTER TO tokendata;
    tokendata = RECORD
        total  : LONGCARD;
        len    : SHORTCARD;
        string : CHAR;
    END;
    NODE = POINTER TO tokentype; (* pointer to a wordnode structure *)
    tokentype = RECORD
        left,right : NODE;
        data       : pstr;
    END;

VAR
    gwordtree : NODE;
CONST
    mingram = 1;
    maxgram = 100;
    undefined=MAX(CARDINAL);
VAR
    (* avoid stack problems thanks to globerks *)
    gwcount,gtwcount:LONGCARD; (* unique tokens, total tokens *)
    gdump:BOOLEAN;
    gskipped,gfilesize:LONGCARD;
TYPE
    tokinfotype = RECORD
        uniques : LONGCARD;
        freq    : LONGCARD;
    END;

PROCEDURE initCounters (counts,showlist:BOOLEAN );
BEGIN
    IF counts THEN
        gwcount   :=0;
        gtwcount  :=0;
        gdump     :=showlist;
    ELSE
        gfilesize :=0;
        gskipped  :=0;
    END;
END initCounters;

PROCEDURE initTree ();
BEGIN
    gwordtree := NIL;
END initTree;

PROCEDURE killNode ( p:NODE);
VAR
    len,used:CARDINAL;
    tmp:pstr;
    pl,pr:NODE;
BEGIN
    IF p=NIL THEN RETURN;  END;
    tmp:=p^.data;
    len:=CARDINAL(tmp^.len);
    used:=SIZE(tokendata)-1+len;
    DEALLOCATE(tmp,used);
    pl:=p^.left;
    pr:=p^.right;
    DEALLOCATE(p,SIZE(tokentype));
    killNode( pl);
    killNode( pr);
END killNode;

PROCEDURE killTree ();
BEGIN
    killNode(gwordtree) ;
END killTree;

PROCEDURE strstore (VAR ok:BOOLEAN; S:ARRAY OF CHAR ): pstr;
VAR
    len,needed:CARDINAL;
    tmp:pstr;
BEGIN
    len:=Str.Length(S);
    needed:=SIZE(tokendata)-1+len;
    ok:=Storage.Available(needed);
    IF ok=FALSE THEN RETURN NIL;END;
    ALLOCATE(tmp, needed);
    tmp^.total:=1;
    tmp^.len := SHORTCARD(len);
    Lib.FastMove( ADR(S),ADR(tmp^.string),len);
    RETURN tmp;
END strstore;

PROCEDURE newnode (VAR p:NODE; snorm:ARRAY OF CHAR):BOOLEAN ;
VAR
    nlen,clen:CARDINAL;
    ok:BOOLEAN;
BEGIN
    IF Storage.Available( SIZE(tokentype) )=FALSE THEN RETURN FALSE; END;
    ALLOCATE (p,SIZE(tokentype));
    p^.left:=NIL;
    p^.right:=NIL;
    p^.data:=strstore(ok,snorm);
    RETURN ok;
END newnode;

PROCEDURE addnode (newptr:NODE);
VAR
    p:NODE;
    s1,s2:str128; (* oversized ! *)
    len:CARDINAL;
BEGIN
    IF gwordtree = NIL THEN
        gwordtree := newptr;
        RETURN;
    END;

	(* insert into binary tree for normal search *)
	
    p:=gwordtree;
    WHILE p # NIL DO
        len:=CARDINAL(p^.data^.len);
        Lib.FastMove( ADR(p^.data^.string),ADR(s1),len);
        s1[len]:=0C;
        len:=CARDINAL(newptr^.data^.len);
        Lib.FastMove( ADR(newptr^.data^.string),ADR(s2),len);
        s2[len]:=0C;
        IF Str.Compare(s1,s2) < 0 THEN
            IF p^.left # NIL THEN
                p := p^.left;
            ELSE
                p^.left := newptr;
                p := NIL;
            END;
        ELSE
            IF p^.right # NIL THEN
                p := p^.right;
            ELSE
                p^.right := newptr;
                p := NIL;
            END;
        END;
    END;

END addnode;

PROCEDURE findMatch (ID:ARRAY OF CHAR):NODE;
VAR
    p   : NODE;
    len : CARDINAL;
    rc  : INTEGER;

    matchID : str128; (* oversized ! *)
BEGIN
    p := gwordtree;
    WHILE p # NIL DO
        len:=CARDINAL(p^.data^.len);
        Lib.FastMove(ADR(p^.data^.string)  ,ADR(matchID), len);
        matchID[len]:=0C;
        rc:=Str.Compare(matchID,ID);
        IF rc=0 THEN
            RETURN p;
        END;
        IF rc <0 THEN
            p:=p^.left;
        ELSE
            p:=p^.right;
        END;
    END;
    RETURN NIL;
END findMatch;

CONST
    sInfo    = 'L = $,  N = $,  C = $, S = "$"';
    sInfoAlt = 'L = $,  N = $,  C = $, S = $'; (* word/token will have double quotes *)

PROCEDURE dumpNode (VAR errcode:CARDINAL; p:NODE);
VAR
    len:CARDINAL;
    S:str128; (* oversized ! *)
    R:str256; (* oversized just in case ! *)
    esc:BOOLEAN;
BEGIN
    IF errcode # errNone THEN RETURN; END;
    IF p=NIL THEN RETURN;  END;

    len:=CARDINAL(p^.data^.len);
    IF gdump THEN
        Lib.FastMove(ADR(p^.data^.string),ADR(S),len);
        S[len]:=0C;

        Str.Copy(R,sInfo);
        Str.Subst(R,"$",lcToStr(2," ", LONGCARD(len) ));
        Str.Subst(R,"$",lcToStr(wi," ",  p^.data^.total    ) );
        Str.Subst(R,"$",lcToStr(wi," ",  LONGCARD(len)*p^.data^.total    ) );
        Str.Subst(R,"$",S);
        WrStr(R);WrLn;
    END;

    INC(gwcount);
    INC(gtwcount,p^.data^.total);

    IF ChkEscape() THEN errcode:=errAborted; END;
    dumpNode(errcode, p^.left);
    dumpNode(errcode, p^.right);
END dumpNode;

PROCEDURE dumpTree (VAR errcode:CARDINAL);
CONST
    msg="Working...";
BEGIN
    IF NOT(gdump) THEN video(msg,TRUE);END;
    errcode:=errNone;
    dumpNode(errcode,gwordtree) ;
    IF NOT(gdump) THEN video(msg,FALSE);END;
END dumpTree;

CONST
    sBytesRead   = hdr+"Bytes read              : ";
    sFilteredOut = hdr+"Characters filtered out : ";
    sTokens      = hdr+"Tokens                  : ";
    sWords       = hdr+"Unique words            : ";
    sTotalWords  = hdr+"Total words             : ";

    sPath        = hdr+"Path                    : ";
    sCase        = hdr+"Ignore case             : ";
    sAccents     = hdr+"Ignore accents          : ";
    sAE          = hdr+'Expand "" and ""      : ';
    sSeparators  = hdr+"[$00..$20] separators   : ";
    sSet         = hdr+"Accept [A..Z0..9]       : ";
    sMode        = hdr+"Word counter mode       : ";
    sRange       = hdr+"n-gram range            : ";
    sCumulative  = hdr+"Cumulative results      : ";
    sRaw         = hdr+"Raw output              : ";

PROCEDURE dumpStats (showme,wordmode,outputnow:BOOLEAN   );
VAR
    S:str128;
BEGIN
    IF NOT(showme) THEN RETURN; END;
    WrLn;
    Str.Concat(S,sBytesRead     ,lcToStr(wi," ",gfilesize));WrStr(S);WrLn;
    Str.Concat(S,sFilteredOut   ,lcToStr(wi," ",gskipped));WrStr(S);WrLn;
    IF wordmode THEN
        Str.Concat(S,sTotalWords,lcToStr(wi," ",gtwcount));WrStr(S);WrLn;
      IF NOT(outputnow) THEN
        Str.Concat(S,sWords     ,lcToStr(wi," ",gwcount));WrStr(S);WrLn;
      END;
    ELSE
      IF NOT(outputnow) THEN
        Str.Concat(S,sTokens      ,lcToStr(wi," ",gwcount));WrStr(S);WrLn;
      END;
    END;
END dumpStats;

PROCEDURE dumpFlags (showme,keepcase,keepaccents,keepae,useblanks,allowdigits,
                    wordmode,cumulative,outputnow:BOOLEAN;lowerlen,upperlen:CARDINAL;
                    spec:ARRAY OF CHAR  );
    MODULE message;
    IMPORT Str;
    FROM IO IMPORT WrStr,WrLn;
    EXPORT msgbool;

    PROCEDURE msgbool (flag:BOOLEAN;S:ARRAY OF CHAR);
    BEGIN
        WrStr(S);
        IF flag THEN
            WrStr("YES");
        ELSE
            WrStr("no");
        END;
        WrLn;
    END msgbool;

    END message;
VAR
    S : str128;
BEGIN
    IF NOT(showme) THEN RETURN;END;
    WrStr(sPath);WrStr(spec);WrLn;
    msgbool(NOT(keepcase)    ,sCase);
    msgbool(NOT(keepaccents) ,sAccents);
    msgbool(NOT(keepae)      ,sAE);
    msgbool(useblanks        ,sSeparators);
    msgbool(allowdigits      ,sSet);
    msgbool(wordmode         ,sMode);
    msgbool(cumulative       ,sCumulative);
    IF NOT(wordmode) THEN
        WrStr(sRange);
        Str.Concat(S,lcToStr(1," ",LONGCARD (lowerlen)),"..");
        Str.Append(S,lcToStr(1," ",LONGCARD (upperlen)));
        Str.Prepend(S,"[");Str.Append(S,"]");
        WrStr(S);WrLn;
    END;
    msgbool(outputnow        ,sRaw);
END dumpFlags;

PROCEDURE scanNode (VAR remaining,uniques,total:LONGCARD;
                   len:SHORTCARD;p:NODE);
BEGIN
    IF p=NIL THEN RETURN;  END;

    IF len = p^.data^.len THEN
        INC(uniques);
        INC(total,p^.data^.total);
        DEC(remaining);
    END;

    scanNode(remaining,uniques,total,len,p^.left);
    scanNode(remaining,uniques,total,len,p^.right);

END scanNode;

CONST
    sfreqInfo   = hdr+"L = $,  uniques = $,  total = $";
    sfreqFinal  = hdr+"    $             $           $";

PROCEDURE dumpFreq (showme:BOOLEAN   );
VAR
    remaining,uniques,total,granduniques,grandtotal:LONGCARD;
    len:SHORTCARD;
    R:str128;
BEGIN
    IF NOT(showme) THEN RETURN; END;
    WrLn;
    granduniques:=0;
    grandtotal:=0;
    remaining:=gwcount;
    len:=1;
    WHILE remaining > 0 DO
        uniques:=0;
        total:=0;
        scanNode(remaining,uniques,total,len,gwordtree);
        INC(granduniques,uniques);
        INC(grandtotal,total);
        Str.Copy(R,sfreqInfo);
        Str.Subst(R,"$",lcToStr(2," ", LONGCARD(len) ));
        Str.Subst(R,"$",lcToStr(wi," ",  uniques ) );
        Str.Subst(R,"$",lcToStr(wi," ",  total ) );
        WrStr(R);WrLn;

        INC(len);
    END;

    Str.Copy(R,sfreqFinal);
    Str.Subst(R,"$","  " ) ;
    Str.Subst(R,"$",widashes);
    Str.Subst(R,"$",widashes);
    WrStr(R);WrLn;

    Str.Copy(R,sfreqFinal);
    Str.Subst(R,"$","  " ) ;
    Str.Subst(R,"$",lcToStr(wi," ",  granduniques ) );
    Str.Subst(R,"$",lcToStr(wi," ",  grandtotal ) );
    WrStr(R);WrLn;

END dumpFreq;

(* ------------------------------------------------------------ *)

CONST
    IObufferSize = (8 * 512) + FIO.BufferOverhead;
    firstBufferByte = 1;
    lastBufferByte  = IObufferSize;
VAR
    ioBuffer : ARRAY [1..IObufferSize] OF BYTE;

PROCEDURE getToken (useblanks,allowdigits:BOOLEAN;hnd : FIO.File;
                   VAR len:CARDINAL; VAR token:ARRAY OF CHAR );
VAR
    ch    : CHAR;
    state : (waiting,getting);
    got   : CARDINAL;
    ok    : BOOLEAN;
BEGIN
    Str.Copy(token,"");
    state := waiting;
    len   := 0;
    LOOP
        got := FIO.RdBin(hnd,ch,1);
        IF got = 0 THEN EXIT; END;
        CASE state OF
        | waiting :
            IF useblanks THEN
                ok:=NOT(isBlank(ch));
            ELSE
                ok:=isOK(ch,allowdigits);
            END;
            IF ok THEN
                Str.Copy(token,ch);
                len:=1;
                state :=getting;
            ELSE
                INC(gskipped);
            END;
        | getting :
            IF useblanks THEN
                ok:=NOT(isBlank(ch));
            ELSE
                ok:=isOK(ch,allowdigits);
            END;
            IF ok THEN
                Str.Append(token,ch);
                INC(len);
            ELSE
                INC(gskipped);
                EXIT;
            END;
        END;
    END;
END getToken;

PROCEDURE buildGrams (VAR errcode:CARDINAL;VAR einfo:ARRAY OF CHAR;
                     lowerlen,upperlen:CARDINAL;
                     keepcase,keepaccents,keepae,
                     useblanks,allowdigits,wordmode,outputnow,verbose:BOOLEAN;
                     f:ARRAY OF CHAR);
CONST
    msgProcessing = "Processing ";
VAR
    hin:FIO.File;
    token:str128; (* should do ! *)
    i,len:CARDINAL;
    j,lim:INTEGER;
    ptr,matchptr:NODE;
    ok:BOOLEAN;
    msg:str128;
    gram:str128; (* str16 was enough but leave room for unlikely extension ! *)
BEGIN
    errcode:=errNone;

    FIO.EOF:=FALSE; (* silly JPI quirks/bugs ! *)
    hin:=FIO.OpenRead(f);
    FIO.AssignBuffer(hin,ioBuffer);

    Str.Concat(msg,msgProcessing,f);Str.Append(msg," ");
    IF NOT(outputnow) THEN video(msg,TRUE); END;
    doEyecandy(verbose,CANDYinit, FIO.Size(hin) );

    INC(gfilesize,FIO.Size(hin));

    LOOP
        IF FIO.EOF THEN EXIT; END;
        doEyecandy(verbose,CANDYshow, FIO.GetPos(hin) );
        getToken(useblanks,allowdigits,hin,len,token);
        IF len=0 THEN EXIT;END;
        IF NOT(keepae) THEN len:=dofixae(token);END;
        IF keepcase THEN
            IF keepaccents THEN
                ;
            ELSE
                removeAccents(token);
            END;
        ELSE
            IF keepaccents THEN
                LowerCaseAlt(token);
            ELSE
                LowerCase(token);
            END;
        END;
        IF wordmode THEN
            IF outputnow THEN
                wrQuoted(token);
                INC(gtwcount); (* update total words is ok here *)
            ELSE
                matchptr:=findMatch(token);
                IF matchptr=NIL THEN
                    IF newnode(ptr, token)=FALSE THEN
                        errcode:=errAllocate;
                        Str.Concat(einfo,token," (");Str.Append(einfo,f);Str.Append(einfo,")");
                        EXIT;
                    END;
                    addnode(ptr);
                ELSE
                    INC( matchptr^.data^.total );
                END;
            END;
        ELSE
            FOR i:=lowerlen TO upperlen DO
                lim:=INTEGER(len)-INTEGER(i)+1;
                j:=0;
                LOOP
                    IF j>=lim THEN EXIT; END;
                    Str.Slice(gram,token, CARDINAL(j),CARDINAL(i) );
                    IF outputnow THEN
                        wrQuoted(gram); (* update gwcount unique tokens would be a nonsense here *)
                    ELSE
                        matchptr:=findMatch(gram);
                        IF matchptr=NIL THEN
                            IF newnode(ptr, gram)=FALSE THEN
                                errcode:=errAllocate;
                                Str.Concat(einfo,token," (");Str.Append(einfo,f);Str.Append(einfo,")");
                                EXIT;
                            END;
                            addnode(ptr);
                        ELSE
                            INC( matchptr^.data^.total );
                        END;
                    END;
                    INC(j);
                END;
                IF errcode # errNone THEN EXIT;END;
            END;
        END;
        IF outputnow THEN
            IF ChkEscape() THEN errcode:=errAborted;EXIT; END;
        END;
    END;
    FIO.Close(hin);

    doEyecandy(verbose,CANDYdone,0);
    IF NOT(outputnow) THEN video(msg,FALSE); END;
END buildGrams;

(* ------------------------------------------------------------ *)
(* ------------------------------------------------------------ *)

CONST
    minlen = 1; (* 0 would be a nonsense here ! and we'll need 1-1 *)
    maxlen = 128; (* should do ! *)
TYPE
    statstype = RECORD
        uniques,total:LONGCARD;
    END;
VAR
    stats : ARRAY [minlen..maxlen] OF statstype;

PROCEDURE dumpFreqRemade (showme:BOOLEAN   );
VAR
    remaining,uniques,total,granduniques,grandtotal:LONGCARD;
    R:str128;
    len,lastlen:CARDINAL;
BEGIN
    IF NOT(showme) THEN RETURN; END;

    lastlen:=maxlen;
    LOOP
        IF stats[lastlen].uniques # 0 THEN EXIT; END;
        DEC(lastlen);
        IF lastlen < minlen THEN RETURN; END;
    END;

    WrLn;
    granduniques:=0;
    grandtotal:=0;

    FOR len:=minlen TO lastlen DO
        uniques:=stats[len].uniques;
        total:=stats[len].total;
        INC(granduniques,uniques);
        INC(grandtotal,total);
        Str.Copy(R,sfreqInfo);
        Str.Subst(R,"$",lcToStr(2," ", LONGCARD(len) ));
        Str.Subst(R,"$",lcToStr(wi," ",  uniques ) );
        Str.Subst(R,"$",lcToStr(wi," ",  total ) );
        WrStr(R);WrLn;
    END;

    Str.Copy(R,sfreqFinal);
    Str.Subst(R,"$","  " ) ;
    Str.Subst(R,"$",widashes);
    Str.Subst(R,"$",widashes);
    WrStr(R);WrLn;

    Str.Copy(R,sfreqFinal);
    Str.Subst(R,"$","  " ) ;
    Str.Subst(R,"$",lcToStr(wi," ",  granduniques ) );
    Str.Subst(R,"$",lcToStr(wi," ",  grandtotal ) );
    WrStr(R);WrLn;
END dumpFreqRemade;

PROCEDURE parseLines (VAR PB:ARRAY OF CHAR; f:ARRAY OF CHAR):CARDINAL;
VAR
    hin:FIO.File;
    S,sPrev,R:str128;
    rc,len:CARDINAL;
    total:LONGCARD;
    what:(ignore,valid);
BEGIN
    FOR len:=minlen TO maxlen DO stats[len].uniques:=0;stats[len].total:=0;END;

    sPrev:="";
    total:=0;
    len:=0;
    rc:=errNone;
    FIO.EOF:=FALSE; (* silly JPI quirks/bugs ! *)
    hin:=FIO.OpenRead(f);
    FIO.AssignBuffer(hin,ioBuffer);
    LOOP
        IF FIO.EOF THEN EXIT;END;
        FIO.RdStr(hin,S);
        LtrimBlanks(S);
        RtrimBlanks(S);
        CASE S[0] OF
        | 0C, rem :
            what:=ignore; (* empty or remark *)
        | dquote :
            what:=valid;
            IF same(S,sPrev) THEN
                INC(total);
            ELSE
                IF len # 0 THEN
                    INC( stats[len].uniques);
                    INC( stats[len].total, total);
                    Str.Copy(R,sInfoAlt); (* no inner quotes ! *)
                    Str.Subst(R,"$",lcToStr( 2," ", LONGCARD( len )));
                    Str.Subst(R,"$",lcToStr(wi," ", total ) );
                    Str.Subst(R,"$",lcToStr(wi," ", LONGCARD( len ) * total ));
                    Str.Subst(R,"$",sPrev);
                    WrStr(R);WrLn;
                END;
                Str.Copy(sPrev,S);
                total:=1;
                len:=Str.Length(sPrev)-2; (* no quotes ! *)
            END;
        ELSE
            Str.Copy(PB,S);
            rc:=errProblem;
            EXIT;
        END;
        IF ChkEscape() THEN
            Str.Copy(PB,"");
            rc:=errAborted;
            EXIT;
        END;
    END;
    FIO.Close(hin);
    IF rc = errNone THEN
        IF total # 0 THEN
                IF len # 0 THEN
                    INC( stats[len].uniques);
                    INC( stats[len].total, total);
                    Str.Copy(R,sInfoAlt); (* no inner quotes ! *)
                    Str.Subst(R,"$",lcToStr( 2," ", LONGCARD( len )));
                    Str.Subst(R,"$",lcToStr(wi," ", total ) );
                    Str.Subst(R,"$",lcToStr(wi," ", LONGCARD( len ) * total ));
                    Str.Subst(R,"$",sPrev);
                    WrStr(R);WrLn;
                END;
        END;
    END;
    RETURN rc;
END parseLines;

(* ------------------------------------------------------------ *)
(* ------------------------------------------------------------ *)

CONST
    firstfile = 1;
    maxfile   = 1000;
VAR
    filename  : ARRAY [firstfile..maxfile] OF str16; (* f8e3 *)
    lastfile  : CARDINAL;

PROCEDURE readMatching(spec:ARRAY OF CHAR;VAR lastfile:CARDINAL);
VAR
    found : BOOLEAN;
    entry : FIO.DirEntry;
BEGIN
    lastfile := firstfile-1;
    found := FIO.ReadFirstEntry(spec,allfiles,entry);
    WHILE found DO
        INC(lastfile);
        IF lastfile > maxfile THEN RETURN;END;
        Str.Copy(filename[lastfile],entry.Name);
        found := FIO.ReadNextEntry(entry);
    END;
END readMatching;

PROCEDURE buildbase (spec:ARRAY OF CHAR;VAR base:ARRAY OF CHAR);
VAR
    u,d,f8,e3 : str128; (* oversized just in CASE -- e3 include dot ! *)
BEGIN
    Lib.SplitAllPath(spec,u,d,f8,e3);
    Lib.MakeAllPath(base,u,d,"","");
    UpperCase(base);
END buildbase;

(* fix common cases  : "." -- "*\" -- "\*.*" *)

PROCEDURE fixdirspec (VAR R : ARRAY OF CHAR  );
VAR
    len : CARDINAL;
BEGIN
    len := Str.Length(R);
    IF len = 0 THEN RETURN; END; (* no risk ! *)
    IF same(R,dot) THEN Str.Copy(R,stardotstar); RETURN; END;        (* "." becomes "*.*" *)
    IF R[len-1]=backslash THEN Str.Append(R,stardotstar);RETURN;END; (* "*\" becomes "*\*.*" *)
END fixdirspec;

PROCEDURE getCard (VAR v:CARDINAL;
                   lower,upper: CARDINAL;S:ARRAY OF CHAR):BOOLEAN;
VAR
    lc:LONGCARD;
    ok:BOOLEAN;
BEGIN
    IF GetLongCard(S,lc)=FALSE THEN RETURN FALSE; END;
    IF lc < LONGCARD(lower) THEN RETURN FALSE; END;
    IF lc > LONGCARD(upper) THEN RETURN FALSE; END;
    v:=CARDINAL(lc);
    RETURN TRUE;
END getCard;

(* ------------------------------------------------------------ *)

VAR
    parmcount,i,opt,lowerlen,upperlen,exactlen:CARDINAL;
    S,R,spec,path,base:str128;
    state : (waiting,gotspec);
    keepcase,keepaccents,keepae,useblanks,allowdigits:BOOLEAN;
    showsummary,showfreq,showflags,showlist,wordmode,cumulative:BOOLEAN;
    outputnow,doanalysis,verbose:BOOLEAN;
    DEBUG:BOOLEAN;
    errcode:CARDINAL;
    einfo:str128;
BEGIN
    FIO.IOcheck := FALSE;
    WrLn; (* yes, here now for help, errors, etc. *)

    parmcount := Lib.ParamCount();
    IF parmcount=0 THEN abort(errHelp,"");END;

    DEBUG       := FALSE;
    lowerlen    := undefined;
    upperlen    := undefined;
    exactlen    := undefined;
    keepcase    := TRUE;
    keepaccents := TRUE;
    keepae      := FALSE;
    useblanks   := FALSE;
    allowdigits := FALSE;
    wordmode    := TRUE;
    cumulative  := TRUE;
    showflags   := TRUE;
    showlist    := TRUE;
    showfreq    := TRUE;
    showsummary := TRUE;
    outputnow   := FALSE;
    doanalysis  := FALSE;
    verbose     := TRUE;

    state :=waiting;
    FOR i := 1 TO parmcount DO
        Lib.ParamStr(S,i); cleantabs(S);
        Str.Copy(R,S);
        UpperCase(R);
        IF isOption(R) THEN
            opt := GetOptIndex(R, "?"+delim+"H"+delim+"HELP"+delim+
                                  "U:"+delim+"UPPER:"+delim+
                                  "L:"+delim+"LOWER:"+delim+
                                  "E:"+delim+"EXACT:"+delim+
                                  "C"+delim+"IGNORECASE"+delim+
                                  "A"+delim+"IGNOREACCENTS"+delim+
                                  "F"+delim+"NOFIXAE"+delim+
                                  "B"+delim+"BLANKS"+delim+
                                  "D"+delim+"DIGITS"+delim+
                                  "Q"+delim+"QUIET"+delim+
                                  "G"+delim+"GRAM"+delim+
                                  "I"+delim+"INDIVIDUAL"+delim+
                                  "X"+delim+"NOFLAGS"+delim+
                                  "L"+delim+"NOLIST"+delim+
                                  "T"+delim+"NOTABLE"+delim+
                                  "S"+delim+"NOSUMMARY"+delim+
                                  "O"+delim+"OUTPUT"+delim+
                                  "M"+delim+"MAKE"+delim+
                                  "DEBUG"
                              );
            CASE opt OF
            | 1,2,3 : abort(errHelp,"");
            | 4,5:    IF exactlen # undefined THEN abort(errConflict,"");END;
                      IF getCard (upperlen, mingram,maxgram,R)=FALSE THEN
                          abort(errUpperLen,S);
                      END;
            | 6,7:    IF exactlen # undefined THEN abort(errConflict,"");END;
                      IF getCard (lowerlen, mingram,maxgram,R)=FALSE THEN
                          abort(errLowerLen,S);
                      END;
            | 8,9:    IF ((lowerlen # undefined) OR (upperlen # undefined)) THEN
                          abort(errConflict,"");
                      END;
                      IF getCard (exactlen, mingram,maxgram,R)=FALSE THEN
                          abort(errExactLen,S);
                      END;
            | 10,11:  keepcase    := FALSE;
            | 12,13:  keepaccents := FALSE;
            | 14,15:  keepae      := TRUE;
            | 16,17:  useblanks   := TRUE;
            | 18,19:  allowdigits := TRUE;
            | 20,21:  verbose     := FALSE;
            | 22,23:  wordmode    := FALSE;
            | 24,25:  cumulative  := FALSE;
            | 26,27:  showflags   := FALSE;
            | 28,29:  showlist    := FALSE;
            | 30,31:  showfreq    := FALSE;
            | 32,33:  showsummary := FALSE;
            | 34,35:  outputnow   := TRUE;
            | 36,37:  doanalysis  := TRUE;
            | 38:     DEBUG       := TRUE;
            ELSE
                abort(errOption,S); (* could be errHelp, eh eh ! *)
            END;
        ELSE
            CASE state OF
            | waiting : Str.Copy(spec,S);
            | gotspec : abort(errTooManyParms,S);
            END;
            INC (state);
        END;
    END;
    IF state=waiting THEN abort(errMissingSpec,"");END;

    fixdirspec(spec);
    readMatching(spec,lastfile);
    IF lastfile > maxfile THEN abort(errTooManyMatches,spec);END;
    IF lastfile < firstfile THEN abort(errNoMatch,spec);END;
    buildbase(spec,base);

(* ------------------------------------------------------------ *)

    IF doanalysis THEN
        IF lastfile > firstfile THEN abort(errJoker,spec);END;
        Str.Concat(path,base,filename[firstfile]);
        i:=parseLines(R, path);
        IF i # errNone THEN abort(i,R);END;
        dumpFreqRemade(showfreq);
        abort(errNone,"");
    END;

(* ------------------------------------------------------------ *)

    IF wordmode THEN
        IF lowerlen # undefined THEN abort(errMode,"");END;
        IF upperlen # undefined THEN abort(errMode,"");END;
        IF exactlen # undefined THEN abort(errMode,"");END;
    END;

    IF lowerlen = undefined THEN lowerlen := mingram; END;
    IF upperlen = undefined THEN upperlen := mingram+1; END;
    IF exactlen = undefined THEN
        IF upperlen < lowerlen THEN
            i:=lowerlen;
            lowerlen:=upperlen;
            upperlen:=i;
        END;
    ELSE
        lowerlen:=exactlen;
        upperlen:=exactlen;
    END;
    IF (allowdigits AND useblanks) THEN abort(errNonsense,"");END;

    dumpFlags (showflags,
              keepcase,keepaccents,keepae,useblanks,allowdigits,
              wordmode,cumulative,outputnow,lowerlen,upperlen,spec);

    IF DEBUG THEN showmem( "enter");END;

    IF outputnow THEN verbose:=FALSE; END;

    IF cumulative THEN initTree();initCounters(FALSE  ,FALSE);END;

    FOR i := firstfile TO lastfile DO
        Str.Concat(path,base,filename[i]);
        IF NOT(cumulative) THEN
            WrLn;WrStr(hdr+"File : ");WrStr(path);WrLn;
            initTree();initCounters( FALSE , FALSE);
        END;

        buildGrams (errcode,einfo,
                   lowerlen,upperlen,
                   keepcase,keepaccents,keepae,useblanks,allowdigits,
                   wordmode,outputnow,verbose,
                   path);
        IF errcode # errNone THEN abort(errcode,einfo);END;

        IF DEBUG THEN showmem( path ); END;

        IF NOT(cumulative) THEN
            WrLn;
            IF NOT(outputnow) THEN
                initCounters(TRUE ,showlist);
                dumpTree(errcode); IF errcode#errNone THEN abort(errAborted,"");END;
                dumpFreq(showfreq);
            END;
            dumpStats(showsummary,wordmode,outputnow);
            IF outputnow THEN
                initCounters(TRUE ,showlist);
            END;
            killTree();
            IF DEBUG THEN showmem( "tree killed");END;
        END;
    END;

    IF cumulative THEN
        WrLn;
        IF NOT(outputnow) THEN
            initCounters(TRUE ,showlist );
            dumpTree(errcode); IF errcode#errNone THEN abort(errAborted,"");END;
            dumpFreq(showfreq);
        END;
        dumpStats(showsummary,wordmode,outputnow);
        killTree();
    END;
    IF DEBUG THEN showmem( "exit");END;

    abort(errNone,"");
END wCount.
