%%%% TITLE: TALLY.STY, Laurent Siebenmann 5-93, alpha version %%% --- a portable TeX tool for counting %%% paragraphs, lines, words, syllables, letters %%% MASTER POSTING: ftp matups.matups.fr in directory TeX/TeXTyping %%% BUGS: To lcs@matups.matups.fr %%% DOCUMENTATION: below, with more after \endinput %%% %%% SYNTAX : e.g. for word count %%% \WordCount ... \endWordCount, OR, for LaTeX (only), %%% \begin{WordCount}...\end{WordCount} %%% where "..." stands for the typescript to be counted. %%% %%% FUNCTION: Tells number of words (etc) in the typescript by %%% switching to a simple one-syllable-per-page style and %%% counting calls to a degenerate output routine. %%% %%% FEATURES %%% Hopefully for all formats. %%% Portable: uses TeX3.xx, nothing more (\LetterCount excepted) %%% Extras: %%% \LineCount ... \endLineCount %% counts lines %%% \SyllableCount ... \endSyllableCount %% counts syllables %%% \LetterCount ... \endLetterCount %%% Counts letters (and digits). %%% It requires special format (see after \endinput). %%% \AllCounts ... \endAllCounts gives all 4 plus a paragraph count. %%% \AllCounts+ ... \endAllCounts %%% All 4 with math, punctuation, paragraph suppression %%% plus deductions. Slow. LaTeX syntax is not available. %%% %%% ADVICE AND CAUTIONS: %%% --- The typescript should contain the whole of %%% any group environment etc. that it meets. %%% --- Learn what "word" means in practice. %%% "Jean-Pierre" is two words; J.~Bond is one. %%% \centerline{Bla bla bla ...} is one, as are %%% $...$ and $$...$$ and \hbox{...} %%% and \vbox{...}. A texpert can change this somewhat. %%% --- \WordCount ... \endWordCount %%% will crudely rip the typescript from your document, so %%% comment out "\WordCount" and "\endWordCount" etc. %%% in production runs. %%% --- To conserve TeX capacity when counts are not in use %%% also comment out the input command for tally.sty. 
%%%% Prelims:
%% Refuse to load twice: \WordCountAt is defined further down, so if
%% it already has a meaning this file has been read before.
\ifx\undefined\WordCountAt
\else
  \def\temp{\immediate\write16{}%
    \immediate\write16{tally.sty already loaded\string!}\endinput}
  \expandafter\temp
\fi
%
%% Remember the catcode of @ (restored just before \endinput).
\chardef\WordCountAt=\catcode`\@
\catcode`\@=11

%%temporarily suppress Plain's logging of allocations
\let\TY@wlog\wlog
\def\wlog#1{\relax}
%
%%% \LE for letter counts.
%% Syntax \begingroup \LE ... \endgroup please!
%% Makes 0--9 and = counted letters. Nearly Pascal convention!
%% Other punctuation not.
%% If the format lacks the LE language (\l@LE undefined) only a
%% warning is written and nothing is changed.
\def\LE{%
  \ifx\undefined\l@LE
    \immediate\write16{}%
    \immediate\write16{ \string!\string!\string! Correct LetterCount requires special patterns.}%
    \immediate\write16{ \string!\string!\string! See tally.sty internal documentation.}%
  \else
    \language=\l@LE \lefthyphenmin=0 \righthyphenmin=0
    \lccode`0=`0 \lccode`1=`1 \lccode`2=`2 \lccode`3=`3 \lccode`4=`4
    \lccode`5=`5 \lccode`6=`6 \lccode`7=`7 \lccode`8=`8 \lccode`9=`9
    \lccode`==`=
  \fi
  }

%% \CountPause is executed twice after each count. By default it is
%% \show, so TeX stops with a "\show\show" display; \QuietCounts
%% turns the pause off.
\def\QuietCounts{\let\CountPause\relax}
\let\CountPause\show

%% Conserve count registers by local use.
\countdef\TY@cnt=190
\countdef\TY@lefthyphenmin=195
\countdef\TY@righthyphenmin=196
%% \language is uprooted onto this register by \LetterCount and
%% \WordCount. It must exist: \let-ting \language to an undefined
%% name would make every \language=... in the typescript an
%% "Undefined control sequence" error.
\countdef\TY@language=197
\countdef\TY@cnti=200
\countdef\TY@cntii=201
\countdef\TY@accentcnt=202
\countdef\TY@Lettercnt=203
\countdef\TY@Syllablecnt=204
\countdef\TY@Wordcnt=205
\countdef\TY@Linecnt=206
\countdef\TY@Parcnt=207
\countdef\TY@SyllableLen=208
\countdef\TY@Gaps=209
\countdef\TY@LLcnt=210

%% Conserve dimen registers by local use
\dimendef\TY@hsize=200
\dimendef\TY@lineskiplimit=203
\dimendef\TY@ChWd=205
%% Stand-ins for uprooted parameters must have the same register
%% type as the parameter they replace:
%% \everypar is a token list, so "\everypar{...}" in the typescript
%% needs a token register to land in;
\toksdef\TY@everypar=201
%% \baselineskip and \parskip are glue, so a "plus"/"minus" clause
%% in the typescript must be absorbed and not typeset as text.
\skipdef\TY@baselineskip=202
\skipdef\TY@parskip=204

%%%% Gen@Count: the main macro.
%% will always be surrounded by grouping
%% Flushes the current page, installs the counting set-up through
%% the \SetTY@... hooks, replaces the output routine by one that
%% only discards box 255, and zeroes \deadcycles. The count is then
%% read from \deadcycles by \endGen@@Count.
\def\Gen@Count{%
  %%% Start
  \vfil\par\break\let\Gen@Count\undefined
  \let\endGen@Count\endGen@@Count
  %%% Impose a suitable language
  \SetTY@language
  %%% Horizontal changes
  \SetTY@everypar
  \SetTY@hsize
  \hbadness=10000 \hfuzz=\maxdimen
  %\pretolerance=100000 %% Alas no effect!
  %%% Vertical changes
  \topskip=0pt plus \vsize
  \baselineskip=\vsize\relax
  %%% Output changes
  \maxdeadcycles=2000000000 %% two billion words at most!
  \csname output\endcsname{\global\setbox255=\box\voidb@x}%
  %%% Initialize
  \deadcycles=0}

%% Reports the count (= \deadcycles) on the terminal, pauses via
%% \CountPause, and hands the result to the caller globally in
%% \TY@cnt.
\def\endGen@@Count{%% defined \endGen@Count in time
  \vfil\par\break
  \errorcontextlines=0
  \immediate\write16{\space *** \Count@Mode\space= \the\deadcycles\space\TY@Msg}%
  \CountPause\CountPause %% gives pause
  \global\TY@cnt=\deadcycles
  \deadcycles=0}

%%% Some defaults
\def\TY@Msg{(starting at input line \the\inputlineno) ***}%
\def\SetTY@hsize{\hsize=\z@\relax
  \let\hsize\TY@hsize} %% uproot!
\def\SetTY@language{}%
\def\SetTY@everypar{\everypar\expandafter{\the\everypar\ }%
  %% "\ " is glue allowing hyphenation of 1st word
  \let\everypar\TY@everypar %% uproot!
  }

%% Not referenced by the active code below; it is used only by the
%% commented-out variant of \TY@CaesarFix.
\def\TY@Safe@Acc#1#2{%
  \if i#2%
    #2\csname @#1@\endcsname \i
  \else
    #2\csname @#1@\endcsname #2\empty
  \fi}

%\def \TY@CaesarFix{%
% \ifx\@Acc\Safe@Acc\let\@Acc\TY@Safe@Acc\fi
% \let\Safe@Acc\TY@Safe@Acc
% \JDaccents\let\noJDaccents\relax}
%% NOTE(review): the test is on \JDaccents but the call is to
%% \DSaccents; both come from external accent packages not visible
%% here -- confirm that this pairing is intended.
\def \TY@CaesarFix{%
  \ifx\JDaccents\undefined
  \else \DSaccents\fi}

%%% \LetterCount:
\def\LetterCount{\begingroup
  \TY@CaesarFix %% fixes CaesarXX and HyAcc-CM
  \let\accent\TY@accentcnt %% fixes Knuth
  \def\Count@Mode{LetterCount}%
  \def\SetTY@language{%
    \LE %% then uproot various things in it
    \let\lefthyphenmin\TY@lefthyphenmin %% uproot \lefthyphenmin
    \let\righthyphenmin\TY@righthyphenmin %% uproot \righthyphenmin
    \let\language\TY@language %% uproot \language
    }%
  \Gen@Count}
%
\def\endLetterCount{\endGen@Count\endgroup}

%%% \SyllableCount: Simple!
\def\SyllableCount{\begingroup
  \def\Count@Mode{SyllableCount}%
  \Gen@Count} %% language variation OK!?
\def\endSyllableCount{\endGen@Count\endgroup}

\newlanguage\l@Unitas %% hopefully saves time?

%%% \WordCount:
\def\WordCount{\begingroup
  \def\Count@Mode{WordCount}%
  \def\SetTY@language{\language=\l@Unitas
    \lefthyphenmin=10000 %% enough to stop hyphenation
    \let\lefthyphenmin\TY@lefthyphenmin %% uproot!
    \let\language\TY@language %% uproot!
    }%
  \Gen@Count}
\def\endWordCount{\endGen@Count\endgroup}

%%% \LineCount: Simple!
%% Leaves \hsize alone, so each page holds one normal line.
\def\LineCount{\begingroup
  \def\Count@Mode{LineCount}%
  \def\SetTY@hsize{}%
  \Gen@Count}
\def\endLineCount{\endGen@Count\endgroup}

%%% \ParCount:
\def\ParCount{\begingroup
  \def\Count@Mode{ParCount}%
  \hsize=.5\maxdimen
  \baselineskip=\z@
  \let\baselineskip\TY@baselineskip %% uproot
  \lineskiplimit=-\maxdimen
  \let\lineskiplimit\TY@lineskiplimit %% uproot
  \parskip=2\vsize
  \let\parskip\TY@parskip %% uproot
  \Gen@Count}
\def\endParCount{\endGen@Count\endgroup}

%% Helpers for \AllCounts+ : punctuation and the carriage return are
%% given category 10 (space); math and displays are emptied through
%% \everymath and \everydisplay.
\def\kill@punct{%
  \catcode`\.=10
  \catcode`\,=10
  \catcode`\;=10
  \catcode`\:=10
  \catcode`\!=10
  \catcode`\?=10
  }
\def\kill@math#1${$\ignorespaces}
\def\kill@display#1$${$$}
\def\kill@par{\catcode"D=10\relax}

%%% \AllCounts
%% Derived statistics written to the terminal by \AllCounts+ .
%% Reads \TY@Lettercnt, \TY@Syllablecnt, \TY@Wordcnt and \TY@Linecnt
%% as set by \AllCounts@.
%% NOTE(review): nothing guards the divisions against a zero count,
%% and the dimen products are not range-checked -- an empty or
%% degenerate typescript may stop with a TeX arithmetic error.
\def\AllCountsPlus@Extras{%
  %%% Mean syllable length
  \TY@cnti=\TY@Lettercnt \multiply\TY@cnti by 10
  \divide\TY@cnti by \TY@Syllablecnt
  \immediate\write16{}%
  \immediate\write16{ *** Mean syllable length = \the\TY@cnti\space char tenths}%
  \TY@SyllableLen=\TY@cnti
  %%% Mean word length
  \TY@cnti=\TY@Lettercnt \multiply\TY@cnti by 10
  \divide\TY@cnti by \TY@Wordcnt
  \immediate\write16{ *** Mean word length = \the\TY@cnti\space char tenths}%
  %%% Interword spaces per line
  \TY@cnti=\TY@Wordcnt\relax\count255=10
  \multiply\TY@cnti by \count255
  \divide\TY@cnti by \TY@Linecnt
  \multiply \count255 by -1
  \advance\TY@cnti by \count255
  \immediate\write16{ *** Interword spaces per line = \the\TY@cnti\space tenths}%
  \TY@Gaps=\TY@cnti
  %%% Mean line length
  \TY@cnti=\TY@Lettercnt\relax
  \TY@cntii=\TY@Gaps\relax
  \divide\TY@cnti by \TY@Linecnt
  \TY@cnt=5\relax\advance\TY@cntii by \TY@cnt\relax
  \TY@cnt=10\relax\divide\TY@cntii by \TY@cnt\relax
  \advance\TY@cnti by \TY@cntii\relax\TY@LLcnt=\TY@cnti
  \immediate\write16{ *** Mean line length = \the\TY@LLcnt\space chars}%
  %%% Mean character width
  \TY@cnti=\TY@LLcnt\relax\dimen8=\hsize
  \divide\dimen8 by \TY@cnti\relax\TY@ChWd=\dimen8\relax
  \immediate\write16{ *** Mean character width = \the\TY@ChWd\space (for \string\rm)}%
  \multiply\dimen8 by \TY@SyllableLen\relax
  \TY@cnti=10\relax\divide \dimen8 by \TY@cnti \relax
  \immediate\write16{ ***** Suggested \string\emergencystretch \space= \the\dimen8}%
  %%% Strain per interword space
  \TY@cnti=\TY@SyllableLen\relax
  \TY@cnt=50 \multiply \TY@cnti by \TY@cnt
  \ifnum\TY@Gaps>0\relax\divide\TY@cnti by \TY@Gaps\relax
  \else\TY@cnti=10000\relax
  \fi
  \immediate\write16{ *** Strain per interword space = \the\TY@cnti\space char hundredths}%
  %%% Badness of mean interword strain
  \multiply \TY@ChWd by \TY@cnti
  \rm\dimen0=\fontdimen3\font\relax
  \divide\TY@ChWd by \dimen0
  \TY@cnt=\TY@ChWd\relax
  \TY@cntii=100
  \multiply\TY@cnt by \TY@ChWd\relax\divide\TY@cnt by \TY@cntii
  \multiply\TY@cnt by \TY@ChWd\relax\divide\TY@cnt by \TY@cntii
  \immediate\write16{ *** Badness of mean interword strain (for \string\rm) \the\TY@cnt}%
  \TY@cntii=8 \multiply \TY@cnt by \TY@cntii\relax\TY@cnti=100
  \ifnum\TY@cnt<\TY@cnti\relax\TY@cnt=100\fi
  \immediate\write16{ ***** Suggested \string\hbadness \space = \the\TY@cnt\space for present \string\hsize=\the\hsize}%
  \immediate\write16{}%
  }

%% \AllCounts+ : the "+" is still in the input when \AllCounts@
%% collects its argument; the \gobble placed in front removes it
%% each time the argument is replayed.
\def\AllCountsPlus{\begingroup
  \kill@punct
  \parindent=\z@\kill@par
  \everymath{\kill@math}%
  \everydisplay{\kill@display}%
  \let\AllCounts@Extras\AllCountsPlus@Extras
  \immediate\write16{ **** AllCounts+ starting at input line \the\inputlineno \space ***** }%
  \AllCounts@\gobble
  }
\def\gobble#1{}
\def\AllCounts@Extras{} %% default

%% \AllCounts peeks at the next token to choose the "+" variant.
\def\AllCounts{\futurelet\next\AllCountsMole}
\def\AllCountsMole{\ifx\next+\expandafter\AllCountsPlus
  \else\begingroup\expandafter\AllCounts@\fi}

%% Runs the typescript #1 once per kind of count; each result comes
%% back in \TY@cnt and is saved in its own register. The per-count
%% pause is silenced inside the group; one pause follows at the end.
\long\def\AllCounts@#1\endAllCounts{%
  \def\CountPause{}%
  \def\TY@Msg{***}%
  \immediate\write16{}%
  \LetterCount#1\endLetterCount
  \TY@Lettercnt=\TY@cnt
  \SyllableCount#1\endSyllableCount
  \TY@Syllablecnt=\TY@cnt
  \WordCount#1\endWordCount
  \TY@Wordcnt=\TY@cnt
  \LineCount#1\endLineCount
  \TY@Linecnt=\TY@cnt
  \ParCount#1\endParCount
  \TY@Parcnt=\TY@cnt
  \AllCounts@Extras
  \endgroup
  \CountPause\CountPause
  }

\let\wlog\TY@wlog
\catcode`\@=\WordCountAt
\endinput

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% Simple test
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%\documentstyle{article} %% for LaTeX
%\begin{document} %% for LaTeX
%\input tally.sty %% for Plain and her children

aaaaaaaaaaaaaaaaaaaaaaaa

\def\Hii{Hi! Hi! Hi! Hi! %\footnote{*}{Foot foot foot}
Hi! Hi! Hi! Hi! Hi! Hi! $x=y=z ...
I would like to know if there is a possibility to
> count the words in a LaTeX document...

Philip Taylor, RHBNC replied:

> Use DVIspell from the emTeX suite [of E. Mattes], then
> count the resulting words using your favourite word counter.

DVIspell is one of the recent miracles of TeX; it requires
careful parsing of the .dvi file.

Yannis Haralambous suggested what is potentially an ad hoc
simplification:

> A weird answer (which could very well fit in a Lovecraft
> novel):...
> PS. if anyone ever tries this method, let me know please

Yannis' idea is to use and ligature/kerning to insert a tagging
character to mark each beginning-of-word in the .dvi file.

As neither solution is very portable, I wish to propose a
completely portable solution tally.sty based on TeX's
line-breaking and \output.

tally.sty also provides paragraph counts, line counts,
syllable counts, and letter counts. The syllable counts are
*not* available via tools based on the ".dvi" file. The full
spectrum of counts help one evaluate and tune hyphenation
performance, so several relevant auxiliary calculations are made.

I have a feeling this is a genuinely useful gadget. Enjoy!

Laurent Siebenmann

Notes.

(i) tally.sty bore the name wordcnt.sty for the first few
weeks of its existence.

(ii) The present name tally.sty comes from Latin talea for
stick. Notched sticks were used for counting and accounting;
lengthwise splitting of the notched stick provided "carbon" copies.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% **** SPECIAL FORMAT COMPILATION FOR LETTER COUNTING **** The letter counting macro \LetterCount ... \endLetterCount requires the special language LE in which each LEtter and each digit is a syllable. You have to add it at format compilation time with INITEX. Near where other hyphenation files are \input, it should suffice to add to your INITEX script the three lines: \newlanguage\l@LE \language=\l@LE \input lehyphen.tex %% a very small patterns file %% appended to this file The crudest way is to add these lines to plain.tex after: input hyphen.tex Since the LE compiled hyphenation patterns are tiny (280 bytes), it is perfectly reasonable to incorporate them into standard formats. In case the LE patterns are needed but *not* present, tally.sty will issue a warning message. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% **** \AllCounts+ used to evaluate and tune hyphenation performance **** There are many hyphenation systems for European languages freely available on the CTAN archives, and there are often several for the same language. Which is adequate? Which is best, all things considered? How can a given one be tuned to a given task? The average length of syllables in a text is a good measure of performance of a hyphenation system. Indeed this length divided by two is the average stretch that will be imposed on a justified line --- with the important proviso that deviation from the mean syllable length is negligible. The formula to calculate this average is #G/#S where #G is the number of letters (Glyphs) and #S is the number of syllables. The quality of the hyphenation is also dependent on the average word length and this is chiefly a function of the language and of the nature of the text. 
The reason for this dependence is that the stretchiness present in a line is roughly the interword stretch multiplied by [N-] where N is the number of words in the line. Here [N-] is the greatest integer strictly less than N. (This is for a justified text.) Thus, the average stretching imposed on a space in a justified line is roughly proportional to both average syllable length and average word length but inversely proportional to the line length. The product of the first two is a measure of the expected badness of the text with regard to line breaking; it is independent of the typesetting and even of the order of the words. And this quantity divided by line length in characters (see below) is approximately the strain on a single interword space --- a useful measure of the expected badness of the linebreaking. A typographer-designer may wish to calculate these measures of badness, and contrive to keep them below some limit values that experience suggests. Line length, font, magnification, and hyphenation system are the main control parameters he can vary. The above discussion neglects punctuation and the spacing around it, hopefully a second order effect. The syntax \AllCounts ... \endAllCounts shows all four counts. There is a further option \AllCounts+ ... \endAllCounts that provides helpful extras. Before using \AllCounts+, best clean out unusual material like titles tables inserts rules from a representative page or two of prose. If the paragraph count is then >1 the cleanup has been less than perfect, but that may not matter. \AllCounts+ forgets all math and displays and punctuation and paragraphs. Only those paragraphs indicated by blank lines are successfully suppressed. \AllCounts+ calculates mean syllable length in characters, and the mean number of wordbreaks per line, plus a few other derived quantities. One of them is the number of characters per line. 
The space between words is counted as a character; this makes the calculated value more nearly constant when word length varies. Another is Knuth's badness for the strain on interword space that would occur for syllables of mean size. Knuth normalises this badness to be 100 when strain equals stretchiness; it then increases as the third power of the strain (thus eightfold when strain doubles). A setting for \hbadness is proposed by \AllCounts+ that will assure that a warning is logged for precisely those lines in which strain exceeds TWICE the mean stretchiness. \emergencystretch is the quantity of stretchiness that TeX adds to renormalise its linebreak optimisation mechanism in case the arithmetic goes off scale. A setting for \emergencystretch is suggested that is twice the strain that would be caused by syllables of mean length. A major factor in hyphenation has escaped us. Occasional long syllables have a disproportionate ill effect on hyphenation. Standard deviation from the mean might evaluate this; but tally.sty does not provide any measure of this deviation. However with the suggested settings of \hbadness and \emergencystretch one can hope to resort to ad hoc measures whenever a line creates an underfull \hbox warning. This is as Knuth intended. The reasonable settings suggested by tally.sty thus help make Knuth's concepts work. Summary for \AllCounts+ : The quantities calculated are helpful in choosing a hyphenation system, and in choosing page design parameters (notably \hsize and \magnification). They may also help an author keep his manuscript within prescribed limits. Finally, they should help choose reasonable values of \emergencystretch and \hbadness to facilitate ad hoc improvements in response to underfull hboxes. *** Hyphenation systems for specific tests. HyAcc-CM.tex CaesarCM.tex CaesarCK.tex are tuned to several sorts of font. You will be able to see quite different performances depending on both the font and the macro settings. 
The format compilation mechanism format-dumper-xx (xx=cm or ck) compiles corresponding formats. All of these are found on ftp matups.matups.fr and/or the CTAN archives. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% **** LIMITATIONS AND REMEDIES **** (A) Boxes: tally.sty messes up counting within boxes: a whole box of words will usually be counted as one or none. This is the case for \centerline{My Title} There are usually ad hoc remedies --- in this case try: \let\centerline=\ Many cases of this problem are well disguised by macros. In amsppt.sty bibliographies for example, one gets roughly the number of entries rather than the count requested. Again there is an ad hoc fix. (B) Inserts: A somewhat different problem is posed by inserts, in particular footnotes. Inserts are excluded from the count and are stored up and printed after the count is complete. Try \def\insert#1{\ } to change this. (C) Math: If you do not want each block of math to add one to the count try \def\gobblemath#1${$} \everymath{\gobblemath} Similarly for displays. See \AllCounts+. (E) Initial words (a solved problem!): Normally TeX does not attempt to hyphenate the first word of a paragraph, since it looks forward from glue items to find words to hyphenate, [TeXBook page 454]. Parindent is, as it turns out, not a suitable starting point, so the first word of a paragraph is normally not hyphenated! Thus the counting macros add some glue to the right end of the \everypar token list. Drastic measures have been taken to prevent TeX macros from redefining \everypar and thus destroying this useful glue: the \everypar syntax is "uprooted", i.e. detached from primitive meaning and temporarily attached to a "meaningless" token register. Remark: This "uprooting" device is much used, for example to protect the typescript from commanding a \language change while a \LetterCount is under way. 
It may be the source of bugs still to be encountered, notably
where applied to \hsize, and \baselineskip. However \par seems
"non-uprootable", perhaps because it is secretly cited by name
in the execution of other primitives of TeX.

(F) The automatic cleanup provided by \AllCounts+ intends to get
rid of everything but words --- in order to evaluate linebreaking
for plain prose; but it is imperfect.

(a) The punctuation is given category 10 (=space). That means
that punctuation in predefined macro expansions does not evaporate.

(b) The carriage return is also given category 10 (=space).
(Recall that \par cannot be "uprooted".) Thus paragraphs implicit
in (sub)titles of various sorts are not eliminated. Add these to
the list of structures you may want to clean out by ad hoc means.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%% lehyphen.tex
%% LEtter count hyphenation patterns
%% for use with word tally.sty
%% to let \LetterCount ... \endLetterCount count the letters
%%
%% To be read by INITEX with \language already set to \l@LE.
%% Every pattern has the form 1x1, i.e. a permitted break on both
%% sides of each letter x, so that each letter is one "syllable".

\begingroup
\catcode`@=11

%%% Globally set high ASCII to 12 %%
\count@="80
\loop
  \global\catcode\the\count@=12
  \ifnum\count@<"FF
  \relax\advance\count@ by 1
\repeat

%%% Globally set their Cork default lc-uc codes
%% \toks0 holds a loop that pairs each slot from \count@ up to
%% \Ct@@ with the slot 32 above it as upper case / lower case.
\newcount\Ct@ \newcount\Ct@@ \newcount\Ct@@@
\toks0={%
  \loop
    \Ct@=\count@\advance\Ct@ by 32
    %% \count@ is Plain scratch counter \count255
    \global\uccode\the\count@=\the\count@
    \global\lccode\the\count@=\the\Ct@
    \global\uccode\the\Ct@=\the\count@
    \global\lccode\the\Ct@=\the\Ct@
    \ifnum\count@<\Ct@@
    \relax\advance\count@ by 1
  \repeat
  }
\count@="80 \Ct@@="9E \the\toks0 %% segment "80 to "BF
%% "9F is \S and "BF is pound
\count@="C0 \Ct@@="DF \the\toks0 %% segment "C0 to "FF
%% The two assignments below are local to this group (no \global).
\lccode"BD=0\uccode"BD=0 %% !!
\lccode"BE=0\uccode"BE=0 %% ??

%% count figures as letters!?
\lccode`0=`0 \lccode`1=`1 \lccode`2=`2 \lccode`3=`3 \lccode`4=`4
\lccode`5=`5 \lccode`6=`6 \lccode`7=`7 \lccode`8=`8 \lccode`9=`9
\lccode`==`=

%% ^^30 ... ^^39 are the digits 0--9 used as letters; a digit that
%% directly follows a hyphenation digit is read as a letter.
%% NOTE(review): 1<1 and 1>1 appear to need nonzero \lccode for
%% < and >, which is not set anywhere in this file -- check whether
%% INITEX reports "Nonletter" here.
\patterns{%
1^^301 1^^311 1^^321 1^^331 1^^341 1^^351 1^^361 1^^371 1^^381 1^^391
1<1 1=1 1>1
1a1 1b1 1c1 1d1 1e1 1f1 1g1 1h1 1i1 1j1 1k1 1l1 1m1
1n1 1o1 1p1 1q1 1r1 1s1 1t1 1u1 1v1 1w1 1x1 1y1 1z1
}

%% The ^^ hex notation requires LOWERCASE hex digits; with an
%% upper-case digit TeX takes the one-character form instead, so
%% the active entry for slot "9E is written ^^9e.
\patterns{%
%1^^9D1 %% Turkish dot I
1^^9e1 %% croatian \dj
%1^^9F1 %% \S
%
1^^a01 1^^a11 1^^a21 1^^a31 1^^a41 1^^a51 1^^a61 1^^a71
1^^a81 1^^a91 1^^aa1 1^^ab1 1^^ac1 1^^ad1 1^^ae1 1^^af1
%
1^^b01 1^^b11 1^^b21 1^^b31 1^^b41 1^^b51 1^^b61 1^^b71
1^^b81 1^^b91 1^^ba1 1^^bb1 1^^bc1
%1^^bd1 %% !!
%1^^be1 %% ??
%1^^bf1 %% pound
%
1^^e01 1^^e11 1^^e21 1^^e31 1^^e41 1^^e51 1^^e61 1^^e71
1^^e81 1^^e91 1^^ea1 1^^eb1 1^^ec1 1^^ed1 1^^ee1 1^^ef1
%
1^^f01 1^^f11 1^^f21 1^^f31 1^^f41 1^^f51 1^^f61 1^^f71
1^^f81 1^^f91 1^^fa1 1^^fb1 1^^fc1 1^^fd1 1^^fe1 1^^ff1
}
\endgroup
%%% end of lehyphen.tex for tally.sty