#!/usr/bin/env perl
##!/usr/bin/perl -w
# latexdiff - differences two latex files on the word level
# and produces a latex file with the differences marked up.
#
# Copyright (C) 2004-22 F J Tilmann (tilmann@gfz-potsdam.de)
#
# Repository/issue tracker: https://github.com/ftilmann/latexdiff
# CTAN page: http://www.ctan.org/pkg/latexdiff
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Detailed usage information at the end of the file
#
#
# Note references to issue numbers are for the github repository of latexdiff: https://github.com/ftilmann/latexdiff
#
# Version 1.3.4:
# New features:
# - Option to use lua-ul instead of ulem (for use with LuaLaTeX) (fix #17, #60, #188, #255, #270)
# Enhancement
# - If amsmath is detected as one of the included (based on idea in PR #263 contributed by github user xlucn
# Bug fix
# - sometimes the introduction of aux commands to show deleted lists or descriptions leaves in place empty auxiliary
# list environments, which would cause error messages (though skipping messages would result in a correct output pdf).
# These are now removed.
# - add more mboxsafecmd and safecmd commands for SIunitx to stay compatible with newer versions of SIunitx (PR #283, fixing issue #282, contributed by github user joe6302413)
# - File added via --preamble option is no longer assumed to be ASCII, but read either with encoding as defined by --encoding option or using the encoding of the LOCALE (fixes issue #285 )
# - multicolumn argument is now treated as text
# - when tikz-dependency package is used, \& is no longer a safe command as it has special meaning inside dependency environment. The fix is a little of a hack as really it should only be considered unsafe within dependency environment (fixes (mostly) issue #303 )
# - listings package had trouble with non-ASCII chars. The encoding is now set (thanks to github user anka-213 for finding this). Fixes #304
#
# Version 1.3.3:
# New features:
# - Option --no-del to remove all deleted text (merge contributed by github user tdegeus PR #252, fixing issue #66)
#
# Bug fixes:
# - Abbreviations involving punctuations within them. They need special treatment because otherwise in some
# circumstances the ignoring of white space differences in conjunction with merging according to MINWORDSBLOCK rule
# could turn 'i.e.' into 'i.\PAR e.' (see https://github.com/ftilmann/latexdiff/issues/269). A few abbreviations
# are now hard-coded and treated as atomic:
# English: i.e., e.g. Deutsch: z.B.
# (fixes issue #269)
# - In WHOLE and COARSE math modes, now properly treat math environments with arguments such as \alignat. Fixes #251
# - For FINE math mode, multiple improvements to the processing work flow yield more robust outcomes. In particular, changes
# to the equation type, e.g. \begin{displaymath} -> \begin{equation} without modifications now usually no longer result
# in errors. (Partially) fixes issues #235 and #244.
# - When encountering deleted math array environments such as align or eqnarray, rather than replacing them with a
# fixed replacement environment (e.g. align* or eqnarray*), an asterisk is now added to the original command, which
# in amsmath (and with eqnarray) will result in the same environment but without line numbers. Config variable MATHARRREPL
# is therefore (nearly) redundant, and a deprecation warning is given when it is set. References to MATHARRREPL have
# been removed from the manual (there is one exception, when it's still being used: sometimes latexdiff can figure out
# that there is a deleted array environment, but does not know which one. In this case, MATHARRREPL is still being used
# to encapsulate these parts of the source, and therefore it is still set internally. But this is a quite rare situation).
# Fixes issue #216
# - Unlike 'array' environment, 'split' (amsmath) does not work in argument of \DIFadd or \DIFdel in UNDERLINE modes; therefore remove it from ARRENV configuration variable.
# Exclude \begin and \end in math environments in COARSE and WHOLE modes. Fixes #258. Fixes #109
# - --flatten now works for empty files. Fixes issue #242
# - improved processing of Chinese and Japanese texts in that splitting is done based on characters. Thanks to LuXu (Oliver Lew) in git for working this out. Fixes #229, fixes #145
# Version 1.3.2
# API adaptions:
# - latexdiff now completes with exit code 0 after --help or --version command (see issue #248)
# New features / feature extensions
# - extend CUSTOMDIFCMD related postprocessing to deal properly with multiline commands, or a sequence of several commands in the same line (see github issue #204)
# - Support for additional macros from import package (\import, \inputfrom, \includefrom, \subimport,\subinputfrom, \subincludefrom). Provided by janniklasrose in PR #243 (fixes #239)
# - replace default driver dvips->pdftex
# Bug fixes:
# - fix issue #206 affecting proper markup of text commands which are not also safe cmd's and have multiple arguments
# - fix issue #210 by adding \eqref (amsmath package) to the list of safe commands
# - fix bug reported in issue #168 mangled verbatim line environment
# - fix bug reported in issue #218 by replacing \hspace{0pt} after \mbox{..} auxiliary commands with \hskip0pt.
# - more ways to process \frac correctly with atomic arguments (committed by julianuu PR #246
# - fix a bug in biblatex mode, which prevented proper processing of modified \textcite (see: https://tex.stackexchange.com/questions/555157/latexdiff-and-biblatex-citation-commands)
# - -h string fix: add -driver option
#
# Version 1.3.1.1
# - remove spurious \n to fix error: Unknown regexp modifier "/n" at .../latexdiff line 1974, near "=~ " (see github issue #201)
#
# Version 1.3.1
# Bug fixes:
# - remove some uninitialised variable $2 warnings in string substitution in flatten function in case included file is not found
# - add minimal postprocessing to diff processing of preamble commands (replace \RIGHTBRACE by \} )
# - pre-processing: replace (contributed) routine take_comments_and_enter_from_frac() with take_comments_and_newline_from_frac(), which does the same thing
# (remove whitespace characters and comments between the argument of \frac commands) in an easier and more robust way. In addition, it
# will replace commands like \frac12 with \frac{1}{2} as pre-processing step. Fixes issue #184
# - add "intertext" to list of unsafe math commands @UNSAFEMATHCMD . Fixes issue #179
# - provide citation command patterns for biblatex and protect them with mbox'es. Fixes issue #199
# - hardcode number of parameters for \href and \url commands to allow spaces between commands and arguments even if --allow-spaces option is not used (this
# is needed because some bibliography styles add such in-command-sequence spaces) Fixes issues: #178 #198
# - bibitem is now kept even in deleted blocks such that deleted references show up properly (this implies that the actual numbers in numerical referencing schemes will change)
# (this is implemented by introducing a new class of commands KEEPCMD , which are kept as is in deleted environments (no effect in added environments). Currently
# \bibitem is hardwired to be the only member of this class (fixes issue #194, #174)
# Features:
# - add some special processing for revtex bibliography commands, so that the spaces between bibliography commands \bibfield and \bibinfo and their arguments are ignored.
# (fixes issue #194, should fix #174)
#
# Version 1.3.0 (7 October 2018):
# - treat options to \documentclass as potential package names (some packages allow implicit loading of or imply selected packages
# - improved pattern matching: now allows nested angular brackets, and is no longer confused by escaped curly braces
# - improved pattern matching in COARSE mode: occasionally, the closing bracket or some other elements would be matched in an 'unnatural' way due to another sequence being more minimal in the computational sense, sometimes even causing errors due to tokens moving in or out of the scope of math environments. This is now discouraged by adding internal \DIFANCHOR commands (which are removed again in post-processing) (fixes issues reported via email by li_ruomeng .
# - verbatim and lstlisting environments are marked-up with line-by-line in a similar style to non-verbatim text (requires the listing package to be installed)
# (see new configuration variable VERBATIMLINEENV) (several issues and pull requests by jprotze)
# - --flatten: now supports \verbatiminput and \lstlistinput
# - --flatten: if file is not found, do not fail, simply warn and leave command unexpanded (inspired by issue #112). Don't warn if file name contains #[0-9] as it is then most likely an argument within a command definition rather than an actual file (applies to \input, \subfile, \include commands)
# - added to textcmds: \intertext
# - new config variable CUSTOMDIFCMD to allow defining special versions of commands in added or deleted blocks (Pull request by github user jprotze)
# - added option -no-links (mostly for use by latexdiff-vc in only-changes modes) (Pull request by github user jprotze)
# - new option --filter-script to run both input through a pre-processing script (PR jasonmccsmith #167)
# new option --no-filter-stderr to hide stderr output from filter-script (potentially dangerous, as this might hide malfunctioning of filter scripts)
# - --flatten now can deal with imports made using the import package {PR jasonmccsmith #173)
# Bug fixes:
# - pattern matching of \verb and \lstinline commands had an error which meant they would trigger on commands beginning with \verb.
# - In description environments, mark up item descriptions by effectively treating the insides of item commands as text commands (fixes #161)
#
#
# Version 1.2.1 (22 June 2017)
# - add "DeclareOldFontCommand" to styles using \bf or \sf old style font commands (fixes issue #92 )
# - improved markup: process lstinline commands in listings package correctly
# for styles using colour, \verb and \lstinline arguments are marked up with colour (blue for added, red for deleted)
# - bug fix: protecting inline math expressions for mbox did not work as intended (see stack exchange question: http://tex.stackexchange.com/questions/359412/compiling-the-latexdiff-when-adding-a-subscript-before-a-pmatrix-environment-cau)
# - bug fix: when deleted \item commands are followed immediately by unsafe commands, they were not restored properly
# (thanks to J. Protze for pull request) (pull request #89)
# - treat lstlisting and comment as equivalent to verbatim environment
# make environments that are treated like verbatim environments configurable (config variable VERBATIMENV)
# treat lstinline as equivalent to verb command
# partially addresses issue #38
# - refactoring: set default configuration variables in a hash, and those that correspond to lists
# - feature: option --add-to-config used to amend configuration variables, which are regex pattern lists
# - bug fix: deleted figures when endfloat package is activated
# - bug fix: alignat environment now always processed correctly (fix issues #65)
# - bug fix: avoid processing of commands as potential files in routine init_regex_arr (fix issue #70 )
# - minimal feature enhancement: treat '@' as allowed character in commands (strictly speaking requires prior \makeatletter statement, but always assuming it to be
# @ a letter if it is part of a command name will usually lead to the correct behaviour (see http://tex.stackexchange.com/questions/346651/latexdiff-and-let)
# - new feature/bug fix: --flatten option \endinput in included files now respected but only if \endinput stands right at the beginning of the line (issue #77)
# - bug fix: flatten would incorrectly attempt to process commented out \include commands (from discussion in issue #77 )
# - introduce an invisible space (\hspace{0pt} after \mbox{..} auxiliary commands (not in math mode), to allow line breaks between added and deleted citations (change should not cause adverse behaviour otherwise)
#
# Version 1.2.0:
# - highlight new and deleted figures
# - bug fix in title mark-up. Previously deleted commands in title (such as \title, \author or \date) were marked up erroneously
# - (minor) bug fixes in new 1.1.1 features: disabled label was commented out twice, additional spaces were introduced before list environment begin and end commands
# - deprecation fix: left brace in RegEx now needs to be escaped
# - add type PDFCOMMENT based on issue #49 submitted by github user peci1 (Martin Pecka)
# - make utf8 the default encoding
#
# Version 1.1.1
# - patch mhchem: allow ce in equations
# - flatten now also expands \input etc. in the preamble (but not \usepackage!)
# - Better support for Japanese ( contributed by github user kshramt )
# - prevent duplicated verbatim hashes (patch contributed by github user therussianjig, issue #36)
# - disable deleted label commands (fixes issue #31)
# - introduce post-processing to reinstate most deleted environments and all needed item commands (fixes issue #1)
#
# Version 1.1.0
# - treat diacritics (\",\', etc) as safe commands
# - treat \_ and \& correctly as safe commands, even if used without spacing to the next word
# - Add a BOLD markup type that sets added text in bold face (Contribution by Victor Zabalza via pull request )
# - add append-mboxsafecmd list option to be able to specify special safe commands which need to be surrounded by mbox to avoid breaking (mostly this is needed with ulem package)
# - support for siunitx and cleveref packages: protect \SI command in siunitx package and \cref,\Cref{range}{*} in cleveref packages (thanks to Stefan Pinnow for testing)
# - experimental support for chemformula, mhchem packages: define \ch and \ce in packages as safe (but not \ch,\cee in equation array environments) - these unfortunately will not be marked up (thanks to Stefan Pinnow for testing)
# - bug fix: packages identified correctly even if \usepackage command options extend over several lines (previously \usepackage command needed to be fully contained in one line)
# - new subtype ONLYCHANGEDPAGE outputs only changed pages (might not work well for floating material)
# - new subtype ZLABEL operates similarly to LABEL but uses absolute page numbers (needs zref package)
# - undocumented option --debug/--nodebug to override default setting for debug mode (Default: 0 for release version, 1: for development version
#
# Version 1.0.4
# - introduce list UNSAFEMATHCMD, which holds list of commands which cannot be marked up with \DIFadd or \DIFdel commands (only relevant for WHOLE and COARSE math markup modes)
# - new subtype LABEL which gives each change a label. This can later be used to only display pages where changes
# have been made (instructions for that are put as comments into the diff'ed file) inspired by answer on http://tex.stackexchange.com/questions/166049/invisible-markers-in-pdfs-using-pdflatex
# - Configuration variables take into account some commands from additional packages:
# tikzpicture environment now treated as PICTUREENV, and \smallmatrix in ARRENV (amsmath)
# - --flatten: support for \subfile command (subfiles package) (in response to http://tex.stackexchange.com/questions/167620/latexdiff-with-subfiles )
# - --flatten: \bibliography commands expand if corresponding bbl file present
# - angled bracket optional commands now parsed correctly (patch #3570) submitted by Dave Kleinschmidt (thanks)
# - \RequirePackage now treated as synonym of \usepackage with respect to setting packages
# - special rules for apacite package (redefine citation commands)
# - recognise /dev/null as 'file-like' arguments for --preamble and --config options
# - fix units package incompatibility with ulem for text maths statements $ ..$ (thanks to Stuart Prescott for reporting this)
# - amsmath environment cases treated correctly (Bug fix #19029) (thanks to Jalar)
# - {,} in comments no longer confuse latexdiff (Bug fix #19146)
# - \% in one-letter sub/Superscripts was not converted correctly
#
# Version 1.0.3
# - fix bug in add_safe_commands that made latexdiff hang on DeclareMathOperator
# command in preamble
# - \(..\) inline math expressions were not parsed correctly, if they contained a linebreak
# - applied patch contributed by tomflannaghan via Berlios: [ Patch #3431 ] Adds correct handling of \left< and \right>
# - \$ is treated correctly as a literal dollar sign (thanks to Reed Cartwright and Joshua Miller for reporting this bug
# and sketching out the solution)
# - \^ and \_ are correctly interpreted as accent and underlined space, respectively, not as superscript of subscript
# (thanks to Wail Yahyaoui for pointing out this bug)
#
# Version 1.0.1 - treat \big,\bigg etc. equivalently to \left and
# \right - include starred version in MATHENV - apply
# - flatten recursively and --flatten expansion is now
# aware of comments (thanks to Tim Connors for patch)
# - Change to post-processing for more reliability for
# deleted math environments
# - On linux systems, recognise and remove DOS style newlines
# - Provide markup for some special preamble commands (\title,
# \author,\date,
# - configurable by setting context2cmd
# - for styles using ulem package, remove \emph and \text.. from list of
# safe commands in order to allow linebreaks within the
# highlighted sections.
# - for ulem style, now show citations by enclosing them in \mbox commands.
# This unfortunately implies linebreaks within citations no longer function,
# so this functionality can be turned off (Option --disable-citation-markup).
# With --enable-citation-markup, the mbox markup is forced for other styles)
# - new substyle COLOR. This is particularly useful for marking up citations
# and some special post-processing is implemented to retain cite
# commands in deleted blocks.
# - four different levels of math-markup
# - Option --driver for choosing driver for modes employing changebar package
# - accept \\* as valid command (and other commands of form \.*). Also accept
# \ (backslashed newline)
# - some typo fixes, include commands defined in preamble as safe commands
# (Sebastian Gouezel)
# - include compared filenames as comments as line 2 and 3 of
# the preamble (can be modified with option --label, and suppressed with
# --no-label), option --visible-label to show files in generated pdf or dvi
# at the beginning of main document
#
# Version 0.5 A number of minor improvements based on feedback
# Deleted blocks are now shown before added blocks
# Package specific processing
#
# Version 0.43 unreleased typo in list of styles at the end
# Add protect to all \cbstart, \cbend commands
# More robust substitution of deleted math commands
#
# Version 0.42 November 06 Bug fixes only
#
# Version 0.4 March 06 option for fast differencing using UNIX diff command, several minor bug fixes (\par bug, improved highlighting of textcmds)
#
# Version 0.3 August 05 improved parsing of displayed math, --allow-spaces
# option, several minor bug fixes
#
# Version 0.25 October 04 Fix bug with deleted equations, add math mode commands to safecmd, add | to allowed interpunctuation signs
# Version 0.2 September 04 extension to utf-8 and variable encodings
# Version 0.1 August 04 First public release
use Algorithm::Diff qw(traverse_sequences);
use Getopt::Long ;
use strict ;
use warnings;
use utf8 ;
use File::Spec ;
# First blank-separated field of the Algorithm::Diff version string
my ($algodiffversion)=split(/ /,$Algorithm::Diff::VERSION);

# Version banner.
# NOTE(review): in this copy of the file the heredoc opener, the banner text and the
# opening line of the configuration hash had been lost (the statement read
# "my ($versionstring)=< 3,"), which is a syntax error. They have been reconstructed from
# the changelog and copyright header at the top of the file - confirm the exact banner
# wording against the upstream release.
my ($versionstring)=<<EOF ;
This is LATEXDIFF 1.3.4 (Algorithm::Diff $algodiffversion, Perl $^V)
  (c) 2004-2022 F J Tilmann
EOF

# Defaults for the configuration variables, keyed by the name of the variable.
# NOTE(review): the hash name %CONFIG is part of the reconstruction - confirm that it
# matches the name used by the rest of the file.
# Entries that are undef receive no default here (presumably they are filled in
# elsewhere - TODO confirm).
my %CONFIG=(
  MINWORDSBLOCK => 3, # minimum number of tokens to form an independent block
                      # shorter identical blocks will be merged to the previous word
  SCALEDELGRAPHICS => 0.5, # factor with which deleted figures will be scaled down (i.e. 0.5 implies they are shown at half linear size)
                           # this is only used for --dgraphics-markup=BOTH option
  FLOATENV => undef , # Environments in which FL variants of defined commands are used
  PICTUREENV => undef , # Environments in which all change markup is removed
  MATHENV => undef , # Environments turning on display math mode (code also knows about \[ and \])
  MATHREPL => 'displaymath', # Environment introducing deleted maths blocks
  MATHARRENV => undef , # Environments turning on eqnarray math mode
  MATHARRREPL => 'eqnarray*', # Environment introducing deleted maths blocks (note that now the starred varieties are being used, so this is only used to replace MATHMODE environments (where original environment is unknown)
  ARRENV => undef , # Environments making arrays in math mode. The underlining style does not cope well with those - as a result in-text math environments are surrounded by \mbox{ } if any of these commands is used in an inline math block
  COUNTERCMD => undef,
  # COUNTERCMD textcmds which are associated with a counter
  # If any of these commands occur in a deleted block
  # they will be followed by an \addtocounter{...}{-1}
  # for the associated counter such that the overall numbers
  # should be the same as in the new file
  LISTENV => undef , # list making environments - they will generally be kept
  VERBATIMENV => undef, # Environments whose content should be treated as verbatim text and not be touched
  VERBATIMLINEENV => undef, # Environments whose content should be treated as verbatim text and processed in line diff mode
  CUSTOMDIFCMD => undef,# Custom dif command. Is defined in the document as a \DELcommand and \ADDcommand version to be replaced by the diff
  ITEMCMD => 'item' # command marking item in a list environment
);
# Configuration variables. They are declared at file scope because they have to be
# visible from the subroutines; no value is assigned at this point.
my $MINWORDSBLOCK;
my $SCALEDELGRAPHICS;
my ($FLOATENV, $PICTUREENV);
my ($MATHENV, $MATHREPL);
my ($MATHARRENV, $MATHARRREPL);
my $ARRENV;
my $COUNTERCMD;
my ($LISTENV, $ITEMCMD);
my ($VERBATIMENV, $VERBATIMLINEENV);
my $CUSTOMDIFCMD;
# my $MINWORDSBLOCK=3; # minimum number of tokens to form an independent block
# # shorter identical blocks will be merged to the previous word
# my $SCALEDELGRAPHICS=0.5; # factor with which deleted figures will be scaled down (i.e. 0.5 implies they are shown at half linear size)
# # this is only used for --graphics-markup=BOTH option
# my $FLOATENV='(?:figure|table|plate)[\w\d*@]*' ; # Environments in which FL variants of defined commands are used
# my $PICTUREENV='(?:picture|tikzpicture|DIFnomarkup)[\w\d*@]*' ; # Environments in which all change markup is removed
# my $MATHENV='(?:equation[*]?|displaymath|DOLLARDOLLAR)[*]?' ; # Environments turning on display math mode (code also knows about \[ and \])
# my $MATHREPL='displaymath'; # Environment introducing deleted maths blocks
# my $MATHARRENV='(?:eqnarray|align|alignat|gather|multline|flalign)[*]?' ; # Environments turning on eqnarray math mode
# my $MATHARRREPL='eqnarray*'; # Environment introducing deleted maths blocks
# my $ARRENV='(?:aligned|gathered|multlined|array|[pbvBV]?matrix|smallmatrix|cases|split)'; # Environments making arrays in math mode. The underlining style does not cope well with those - as a result in-text math environments are surrounded by \mbox{ } if any of these commands is used in an inline math block
# my $COUNTERCMD='(?:footnote|part|chapter|section|subsection|subsubsection|paragraph|subparagraph)'; # textcmds which are associated with a counter
# # If any of these commands occur in a deleted block
# # they will be succeeded by an \addtocounter{...}{-1}
# # for the associated counter such that the overall numbers
# # should be the same as in the new file
# my $LISTENV='(?:itemize|description|enumerate)'; # list making environments - they will generally be kept
# my $ITEMCMD='item'; # command marking item in a list environment
# Commands matching this pattern are disabled within deleted blocks. This is mostly
# useful in maths mode; otherwise it would be fine to simply not add them to SAFECMDLIST.
my $LABELCMD = '(?:label)';
# Commands which are definitely unsafe for marking up in math mode (amsmath's qedhere has
# only been tested to not work with UNDERLINE markup). This only affects the WHOLE and
# COARSE math markup modes. Unlike in text mode (or FINE math mode), deleted unsafe
# commands are not deleted but simply taken outside \DIFdel.
my @UNSAFEMATHCMD = qw(qedhere intertext begin end);
# If set to 1, marked-up inline maths expressions are surrounded with \mbox to get around
# compatibility problems between some maths packages and the ulem package.
my $MBOXINLINEMATH = 0;
# Markup strings
# Parts of the program (particularly post-processing) depend on the actual text of these
# tokens, so do not change them if at all possible. At the very least, subroutine
# postprocess has to be adapted to any new tokens.

# Added text: tokens marking begin/end of appended text, wrapper around an added text
# passage, and the tag of an added comment line.
my ($ADDMARKOPEN, $ADDMARKCLOSE) = ('\DIFaddbegin ', '\DIFaddend ');
my ($ADDOPEN, $ADDCLOSE)         = ('\DIFadd{', '}');
my $ADDCOMMENT                   = 'DIF > ';

# Deleted text: the same four roles for deleted material.
my ($DELMARKOPEN, $DELMARKCLOSE) = ('\DIFdelbegin ', '\DIFdelend ');
my ($DELOPEN, $DELCLOSE)         = ('\DIFdel{', '}');
my $DELCOMMENT                   = 'DIF < ';

# Deleted commands: the opening mark must begin with % (i.e. be a comment), the closing
# mark must end with a new line.
my ($DELCMDOPEN, $DELCMDCLOSE)   = ('%DIFDELCMD < ', "%%%\n");

# Follows auxiliary commands put in by latexdiff to make the difference file legal.
# Auxiliary commands must be on a line of their own.
# For verbatim environment openings the %DIFAUXCMD cannot be placed in the same line, as
# it would then be shown; the special form "%DIFAUXCMD NEXT" indicates that the next line
# is an auxiliary command. Similarly "%DIFAUXCMD LAST" would indicate that the auxiliary
# command is in the previous line (not currently used).
my $AUXCMD = '%DIFAUXCMD';

# Marks lines which are within a verbatim environment
my $VERBCOMMENT = 'DIFVRB ';
# main local variables:
# Each class of commands is described by a ...LIST array of patterns and a matching
# ...EXCL array of exclusion patterns.

# Commands with text arguments / commands without text arguments (a pattern matching
# both TEXTCMDLIST and TEXTCMDEXCL is excluded).
my (@TEXTCMDLIST, @TEXTCMDEXCL);
# Commands with text arguments (subset of text commands) which cause confusion if used
# out of context (e.g. \caption). In deleted passages the command is disabled but its
# argument is marked up; otherwise they behave exactly like TEXTCMD's.
# The exclude list is always empty.
my (@CONTEXT1CMDLIST, @CONTEXT1CMDEXCL);
# Commands with text arguments which fail or cause confusion if used out of context
# (e.g. \title). They and their arguments are disabled in deleted passages.
# The exclude list is always empty.
my (@CONTEXT2CMDLIST, @CONTEXT2CMDEXCL);
# Treated like textcmd. If such a command is in a deleted or added block, the whole
# content is just wrapped with \DIFadd or \DIFdel irrespective of content. This is useful
# for the pseudo commands \MATHBLOCK.. into which math environments are being transformed.
my (@MATHTEXTCMDLIST, @MATHTEXTCMDEXCL);
# Safe commands (which do not break when in the argument of DIFadd or DIFDEL).
# These two are declared with "our" rather than "my" because later code has to
# "local"ise them.
our (@SAFECMDLIST, @SAFECMDEXCL) = ();
# Commands which are in principle safe but need to be surrounded by an \mbox; all the
# patterns in MBOXCMDLIST will be appended to SAFECMDLIST.
my (@MBOXCMDLIST, @MBOXCMDEXCL);
# Commands which should not be deleted in nominally deleted text passages
my @KEEPCMDLIST = ( qr/^bibitem$/ );
my @KEEPCMDEXCL;
# File-scoped working variables (scalars first, then arrays); none is initialised here.
my (
  $i, $j, $l,
  $old, $new,
  $line, $key,
  $newpreamble, $oldpreamble, $latexdiffpreamble,
  $oldbody, $newbody, $diffbo,
  $oldpost, $newpost,
  $diffall,
);
my (@dumlist, @newpreamble, @oldpreamble, @diffpreamble, @diffbody);

# Option names, grouped by theme ($disablemathmark is deliberately not declared any more)
my ($type, $subtype, $floattype);
my ($config, $preamblefile, $encoding);
my ($nolabel, $visiblelabel);
my ($filterscript, $ignorefilterstderr);
my ($showpreamble, $showsafe, $showtext, $showconfig, $showall);
my ($replacesafe, $appendsafe, $excludesafe, $mboxsafe);
my ($replacetext, $appendtext, $excludetext);
my ($replacecontext1, $appendcontext1);
my ($replacecontext2, $appendcontext2);
my ($help, $verbose, $version, $debug, $ignorewarnings);
my ($driver, $onlyadditions);
my ($enablecitmark, $disablecitmark);
my ($allowspaces, $flatten, $nolinks, $earlylatexdiffpreamble);
# Mnemonics for the values of $mathmarkup
my $mathmarkup;
use constant OFF    => 0;
use constant WHOLE  => 1;
use constant COARSE => 2;
use constant FINE   => 3;
# Mnemonics for the values of $graphicsmarkup
my $graphicsmarkup;
use constant NONE    => 0;
use constant NEWONLY => 1;
use constant BOTH    => 2;
my $mboxcmd;
# List-valued settings; most of them are filled by the GetOptions call further down
my (@configlist, @addtoconfiglist);
my @labels;
my (@appendsafelist, @excludesafelist);
my (@appendmboxsafelist, @excludemboxsafelist);
my (@appendtextlist, @excludetextlist);
my (@appendcontext1list, @appendcontext2list);
my @packagelist;
my $assign;
my @config;
# %packages: the keys correspond to the names of all included packages (the documentclass
# is included as just another package); the values are the optional arguments given to
# the respective package.
my $pkg;
my %packages;
# Defaults
$mathmarkup=COARSE;
$verbose=0;
$onlyadditions=0;
# output debug and intermediate files, set to 0 in final distribution
$debug=0;
# insert preamble directly after documentclass - experimental feature, set to 0 in final distribution
# Note that this failed with mini example (or other files, where packages used in latexdiff preamble
# are called again with incompatible options in preamble of resulting file)
$earlylatexdiffpreamble=0;
# define character properties
# IsNonAsciiPunct: user-defined character property, usable in patterns as \p{IsNonAsciiPunct}.
# Returns the property definition (one set operation per line, each line newline-terminated):
# Unicode punctuation but excluding ASCII punctuation
sub IsNonAsciiPunct {
  my @setops = ( '+utf8::IsPunct', '-utf8::IsASCII' );
  return join( '', map { "$_\n" } @setops );
}
# IsNonAsciiS: user-defined character property, usable in patterns as \p{IsNonAsciiS}.
# Returns the property definition (one set operation per line, each line newline-terminated):
# Unicode symbol but excluding ASCII
sub IsNonAsciiS {
  my @setops = ( '+utf8::IsS', '-utf8::IsASCII' );
  return join( '', map { "$_\n" } @setops );
}
my %verbhash;
# Parse the command line. With 'bundling', single-letter options can be combined behind a single dash.
# Options taking a value (=s) which may be repeated are collected in arrays, all others are stored in scalars.
Getopt::Long::Configure('bundling');
GetOptions('type|t=s' => \$type,
'subtype|s=s' => \$subtype,
'floattype|f=s' => \$floattype,
'config|c=s' => \@configlist,
'add-to-config=s' => \@addtoconfiglist,
'preamble|p=s' => \$preamblefile,
'encoding|e=s' => \$encoding,
'label|L=s' => \@labels,
'no-label' => \$nolabel,
'visible-label' => \$visiblelabel,
'exclude-safecmd|A=s' => \@excludesafelist,
'replace-safecmd=s' => \$replacesafe,
'append-safecmd|a=s' => \@appendsafelist,
'exclude-textcmd|X=s' => \@excludetextlist,
'replace-textcmd=s' => \$replacetext,
'append-textcmd|x=s' => \@appendtextlist,
'replace-context1cmd=s' => \$replacecontext1,
'append-context1cmd=s' => \@appendcontext1list,
'replace-context2cmd=s' => \$replacecontext2,
'append-context2cmd=s' => \@appendcontext2list,
'exclude-mboxsafecmd=s' => \@excludemboxsafelist,
'append-mboxsafecmd=s' => \@appendmboxsafelist,
'show-preamble' => \$showpreamble,
'show-safecmd' => \$showsafe,
'show-textcmd' => \$showtext,
'show-config' => \$showconfig,
'show-all' => \$showall,
'packages=s' => \@packagelist,
'allow-spaces' => \$allowspaces,
'math-markup=s' => \$mathmarkup,
'graphics-markup=s' => \$graphicsmarkup,
'enable-citation-markup|enforce-auto-mbox' => \$enablecitmark,
'disable-citation-markup|disable-auto-mbox' => \$disablecitmark,
'verbose|V' => \$verbose,
'ignore-warnings' => \$ignorewarnings,
'driver=s'=> \$driver,
'flatten' => \$flatten,
'filter-script=s' => \$filterscript,
'ignore-filter-stderr' => \$ignorefilterstderr,
'no-links' => \$nolinks,
'no-del' => \$onlyadditions,
'version' => \$version,
'help|h' => \$help,
'debug!' => \$debug ) or die "Use latexdiff -h to get help.\n" ;
# --help: hand over to usage() (defined elsewhere in this file; NOTE(review): presumably it does not return - confirm)
if ( $help ) {
usage() ;
}
# --version: print version string and stop
if ( $version ) {
print STDERR $versionstring ;
exit 0;
}
print STDERR $versionstring if $verbose;
# --show-all is a shorthand for all the individual --show-... options
if (defined($showall)){
$showpreamble=$showsafe=$showtext=$showconfig=1;
}
# Default types
$type='UNDERLINE' unless defined($type);
$subtype='SAFE' unless defined($subtype);
# set floattype to IDENTICAL for LABEL and ONLYCHANGEDPAGE subtype, unless it has been set explicitly on the command line
$floattype=($subtype eq 'LABEL' || $subtype eq 'ONLYCHANGEDPAGE') ? 'IDENTICAL' : 'FLOATSAFE' unless defined($floattype);
if ( $subtype eq 'LABEL' ) {
  # the note is terminated by a newline so that later messages on STDERR start on a line of their own
  print STDERR "Note that LABEL subtype is deprecated. If possible, use ZLABEL instead (requires zref package)\n";
}
# Translate the value of --math-markup into one of the constants OFF, WHOLE, COARSE, FINE.
# The value is matched case-insensitively; the digits 0-3 are accepted directly.
if (defined($mathmarkup)) {
  $mathmarkup=~tr/a-z/A-Z/;
  if ( $mathmarkup eq 'OFF' ){
    $mathmarkup=OFF;
  } elsif ( $mathmarkup eq 'WHOLE' ){
    $mathmarkup=WHOLE;
  } elsif ( $mathmarkup eq 'COARSE' ){
    $mathmarkup=COARSE;
  } elsif ( $mathmarkup eq 'FINE' ){
    $mathmarkup=FINE;
  } elsif ( $mathmarkup !~ m/^[0123]$/ ) {
    die "latexdiff Illegal value: ($mathmarkup) for option --math-markup. Possible values: OFF,WHOLE,COARSE,FINE,0-3\n";
  }
  # else use numerical value
}
# Give filterscript a default empty string
$filterscript="" unless defined($filterscript);
# setting extra preamble commands
# The lines returned by extrapream() are joined with newlines (the trailing "" ensures a final newline).
# They are either taken from the file given with --preamble or selected by type, subtype and floattype.
if (defined($preamblefile)) {
$latexdiffpreamble=join "\n",(extrapream($preamblefile),"");
} else {
$latexdiffpreamble=join "\n",(extrapream($type,$subtype,$floattype),"");
}
if ( defined($driver) ) {
# for changebar only
# (replaces every occurrence of the option [pdftex] in the preamble by the driver given with --driver)
$latexdiffpreamble=~s/\[pdftex\]/[$driver]/sg;
}
# All command lists below are set up in the same way: the list is initialised either from the argument of the
# --replace-... option or from the built-in defaults; then the arguments of the --append-... options are added.
# The arguments of the --exclude-... options are collected in the corresponding ...EXCL array.
# setting up @SAFECMDLIST and @SAFECMDEXCL
if (defined($replacesafe)) {
init_regex_arr_ext(\@SAFECMDLIST,$replacesafe);
} else {
init_regex_arr_data(\@SAFECMDLIST, "SAFE COMMANDS");
}
foreach $appendsafe ( @appendsafelist ) {
init_regex_arr_ext(\@SAFECMDLIST, $appendsafe);
}
foreach $excludesafe ( @excludesafelist ) {
init_regex_arr_ext(\@SAFECMDEXCL, $excludesafe);
}
# setting up @MBOXCMDLIST and @MBOXCMDEXCL
# (no built-in defaults and no replace option are used for these lists here)
foreach $mboxsafe ( @appendmboxsafelist ) {
init_regex_arr_ext(\@MBOXCMDLIST, $mboxsafe);
}
foreach $mboxsafe ( @excludemboxsafelist ) {
init_regex_arr_ext(\@MBOXCMDEXCL, $mboxsafe);
}
# setting up @TEXTCMDLIST and @TEXTCMDEXCL
if (defined($replacetext)) {
init_regex_arr_ext(\@TEXTCMDLIST,$replacetext);
} else {
init_regex_arr_data(\@TEXTCMDLIST, "TEXT COMMANDS");
}
foreach $appendtext ( @appendtextlist ) {
init_regex_arr_ext(\@TEXTCMDLIST, $appendtext);
}
foreach $excludetext ( @excludetextlist ) {
init_regex_arr_ext(\@TEXTCMDEXCL, $excludetext);
}
# setting up @CONTEXT1CMDLIST ( @CONTEXT1CMDEXCL exist but is always empty )
if (defined($replacecontext1)) {
init_regex_arr_ext(\@CONTEXT1CMDLIST,$replacecontext1);
} else {
init_regex_arr_data(\@CONTEXT1CMDLIST, "CONTEXT1 COMMANDS");
}
foreach $appendcontext1 ( @appendcontext1list ) {
init_regex_arr_ext(\@CONTEXT1CMDLIST, $appendcontext1);
}
# setting up @CONTEXT2CMDLIST ( @CONTEXT2CMDEXCL exist but is always empty )
if (defined($replacecontext2)) {
init_regex_arr_ext(\@CONTEXT2CMDLIST,$replacecontext2);
} else {
init_regex_arr_data(\@CONTEXT2CMDLIST, "CONTEXT2 COMMANDS");
}
foreach $appendcontext2 ( @appendcontext2list ) {
init_regex_arr_ext(\@CONTEXT2CMDLIST, $appendcontext2);
}
# setting configuration variables
# $collect_assignments->($source,$what)
#   Expands one argument of the --config or --add-to-config option into a list of "variable=value" strings.
#   $source : either the name of a file with one assignment per line, or a comma-separated list of assignments
#   $what   : description of the kind of file, only used in the error message
#   If $source is a plain file (or /dev/null, which does not pass the -f test), the file is read line by line,
#   skipping blank lines and comment lines beginning with # or %. Dies if the file cannot be opened.
my $collect_assignments = sub {
  my ($source,$what)=@_;
  # not a file: the argument itself is the comma-separated list of assignments
  return split(",",$source) unless ( -f $source || lc($source) eq '/dev/null' );
  my @assignments=();
  # lexical handle and three-argument open, so that the file name is never interpreted as an open mode
  open(my $fh,'<',$source) or die ("Couldn't open $what file $source: $!");
  while ( my $entry=<$fh> ) {
    chomp $entry;
    next if $entry =~ /^\s*#/ || $entry =~ /^\s*%/ || $entry =~ /^\s*$/ ;
    push (@assignments,$entry);
  }
  close($fh);
  return @assignments;
};
@config=();
foreach $config ( @configlist ) {
  push @config,$collect_assignments->($config,"configuration");
}
# --config: assignments replace the value of the configuration variable
foreach $assign ( @config ) {
  $assign=~ m/\s*(\w*)\s*=\s*(\S*)\s*$/ or die "Illegal assignment $assign in configuration list (must be variable=value)";
  exists $CONFIG{$1} or die "Unknown configuration variable $1.";
  $CONFIG{$1}=$2;
}
my @addtoconfig=();
foreach $config ( @addtoconfiglist ) {
  push @addtoconfig,$collect_assignments->($config,"add-to-config");
}
# initialise default lists from DATA
# for those configuration variables, which have not been set explicitly, initiate from list in document
foreach $key ( keys(%CONFIG) ) {
  if (!defined $CONFIG{$key}) {
    @dumlist=();
    init_regex_arr_data(\@dumlist,"$key CONFIG");
    $CONFIG{$key}=join(";",@dumlist)
  }
}
# --add-to-config: values are appended (semi-colon separated) to the current value, which at this stage is
# either the value set with --config or the built-in default
foreach $assign ( @addtoconfig ) {
  ###print STDERR "assign:|$assign|\n";
  $assign=~ m/\s*(\w*)\s*=\s*(\S*)\s*$/ or die "Illegal assignment $assign in configuration list (must be variable=value)";
  exists $CONFIG{$1} or die "Unknown configuration variable $1.";
  $CONFIG{$1}.=";$2";
}
# Map from hash to variables (we do this to have more concise code later, change from comma-separated list)
# List-valued settings (semi-colon separated) are converted into a regular expression matching any of the
# elements by liststringtoregex; all other settings are copied verbatim.
foreach ( keys(%CONFIG) ) {
if ( $_ eq "MINWORDSBLOCK" ) { $MINWORDSBLOCK = $CONFIG{$_}; }
elsif ( $_ eq "FLOATENV" ) { $FLOATENV = liststringtoregex($CONFIG{$_}) ; }
elsif ( $_ eq "ITEMCMD" ) { $ITEMCMD = $CONFIG{$_} ; }
elsif ( $_ eq "LISTENV" ) { $LISTENV = liststringtoregex($CONFIG{$_}) ; }
elsif ( $_ eq "PICTUREENV" ) { $PICTUREENV = liststringtoregex($CONFIG{$_}) ; }
elsif ( $_ eq "MATHENV" ) { $MATHENV = liststringtoregex($CONFIG{$_}) ; }
elsif ( $_ eq "MATHREPL" ) { $MATHREPL = $CONFIG{$_} ; }
elsif ( $_ eq "MATHARRENV" ) { $MATHARRENV = liststringtoregex($CONFIG{$_}) ; }
elsif ( $_ eq "MATHARRREPL" ) { $MATHARRREPL = $CONFIG{$_} ;
# the warning is issued for any value which does not contain eqnarray*
print STDERR "WARNING: Setting MATHARRREPL is depracated. Generally deleted math array environments will be set to their starred varieties and the setting of MATHARREPL is ignored.\n\n" unless $MATHARRREPL =~ /eqnarray\*/ ;
}
elsif ( $_ eq "ARRENV" ) { $ARRENV = liststringtoregex($CONFIG{$_}) ; }
elsif ( $_ eq "VERBATIMENV" ) { $VERBATIMENV = liststringtoregex($CONFIG{$_}) ; }
elsif ( $_ eq "VERBATIMLINEENV" ) { $VERBATIMLINEENV = liststringtoregex($CONFIG{$_}) ; }
elsif ( $_ eq "CUSTOMDIFCMD" ) { $CUSTOMDIFCMD = liststringtoregex($CONFIG{$_}) ; }
elsif ( $_ eq "COUNTERCMD" ) { $COUNTERCMD = liststringtoregex($CONFIG{$_}) ; }
elsif ( $_ eq "SCALEDELGRAPHICS" ) { $SCALEDELGRAPHICS = $CONFIG{$_} ; }
else { die "Unknown configuration variable $_.";}
}
# for COARSE and WHOLE math markup, register the pseudo commands MATHBLOCK<environment> in @MATHTEXTCMDLIST
if ( $mathmarkup == COARSE || $mathmarkup == WHOLE ) {
push(@MATHTEXTCMDLIST,qr/^MATHBLOCK(?:$MATHENV|$MATHARRENV|SQUAREBRACKET)$/);
}
# packages given with --packages (comma-separated) are registered with empty option string
foreach $pkg ( @packagelist ) {
map { $packages{$_}="" } split(/,/,$pkg) ;
}
# the --show-... options only display the configuration; no input files are needed or processed
if ($showconfig || $showtext || $showsafe || $showpreamble) {
show_configuration();
exit 0;
}
if ( @ARGV != 2 ) {
print STDERR "2 and only 2 non-option arguments required. Write latexdiff -h to get help\n";
exit(2);
}
# Are extra spaces between command arguments permissible?
# ($extraspace is a pattern fragment which is interpolated into $cmdoptseq below)
my $extraspace;
if ($allowspaces) {
$extraspace='\s*';
} else {
$extraspace='';
}
# append context lists to text lists (as text property is implied)
push @TEXTCMDLIST, @CONTEXT1CMDLIST;
push @TEXTCMDLIST, @CONTEXT2CMDLIST;
push @TEXTCMDLIST, @MATHTEXTCMDLIST if $mathmarkup==COARSE;
# internal additions to SAFECMDLIST
push(@SAFECMDLIST, qr/^QLEFTBRACE$/, qr/^QRIGHTBRACE$/);
# Patterns. These are used by some of the subroutines, too
# I can only define them down here because value of extraspace depends on an option
# $pat0: text without any braces; $pat_n: text with balanced braces, built by wrapping the pattern
# around itself once per loop iteration, i.e. the number of iterations limits the nesting depth
my $pat0 = '(?:[^{}])*';
my $pat_n = $pat0;
# if you get "undefined control sequence MATHBLOCKmath" error, increase the maximum value in this loop
for (my $i_pat = 0; $i_pat < 20; ++$i_pat){
$pat_n = '(?:[^{}]|\{'.$pat_n.'\}|\\\\\{|\\\\\})*';
# Actually within the text body, quoted braces are replaced in pre-processing. The only place where
# the last part of the pattern matters is when processing the arguments of context2cmds in the preamble
# and these contain a \{ or \} combination, probably rare.
# It should thus be fine to use the simpler version below.
### $pat_n = '(?:[^{}]|\{'.$pat_n.'\})*';
}
# $brat0/$brat_n: same construction for square brackets (4 iterations)
my $brat0 = '(?:[^\[\]]|\\\[|\\\])*';
my $brat_n = $brat0;
for (my $i_pat = 0; $i_pat < 4; ++$i_pat){
$brat_n = '(?:[^\[\]]|\['.$brat_n.'\]|\\\[|\\\])*';
### $brat_n = '(?:[^\[\]]|\['.$brat_n.'\])*'; # Version not taking into account escaped \[ and \]
}
# $abrat0: text without angle brackets (no nesting)
my $abrat0 = '(?:[^<>])*';
# variable definitions are in order that they are matched
my $and = '&';
my $quotemarks = '(?:\'\')|(?:\`\`)';
# some common abbreviations involving punctuations within them. They need special treatment because otherwise in some
# circumstances the ignoring of white space differences in conjunction with merging according to MINWORDSBLOCK rule
# could turn 'i.e.' into 'i.\PAR e.' (see https://github.com/ftilmann/latexdiff/issues/269)
# English: i.e., e.g. Deutsch: z.B.
my $abbreviation='(?:i\. ?e\.|e\. ?g\.|z\. ?B\.)' ;
my $number='-?\d*\.\d*';
# word: sequence of letters or accents followed by letter
# (characters matching $word_cj always form a word on their own)
my $word_cj='\p{Han}|\p{InHiragana}|\p{InKatakana}';
my $word='(?:' . $word_cj . '|(?:(?:[-\w\d*]|\\\\[\"\'\`~^][A-Za-z\*])(?!(?:' . $word_cj . ')))+)';
# quoted underscore - this needs special treatment as perl treats _ as a letter (\w) but latex does not
# such that a\_b would otherwise be interpreted as a{\_}b by latex but a{\_b} by latexdiff
my $quotedunderscore='\\\\_';
# Handle tex \def macro: \def\MAKRONAME#1[#2]#3{DEFINITION}
my $defseq='\\\\def\\\\[\w\d@\*]+(?:#\d+|\[#\d+\])+(?:\{'. $pat_n . '\})?';
my $cmdleftright='\\\\(?:left|right|[Bb]igg?[lrm]?|middle)\s*(?:[<>()\[\]|\.]|\\\\(?:[|{}]|\w+))';
# for selected commands, the number of arguments is known, and we can therefore allow spaces between command and its argument
# Note that it is still expected that the arguments are blocks marked by parentheses rather than single characters, and that intervening comments will inhibit the association
my $predefinedcmdoptseq12='\\\\(?:href|bibfield|bibinfo)\s*(?:\['.$brat_n.'\])?\s*(?:\{'. $pat_n . '\}\s*){2}'; # Commands with one optional and two non-optional arguments
my $predefinedcmdoptseq01='\\\\(?:url|BibitemShut)\s*\s*(?:\{'. $pat_n . '\}\s*){1}'; # Commands with one non-optional argument
# \bibitem in revtex styles appears to be always followed by \BibItemOpen. We bind \BibItemOpen to the bibitem (if present) in order to prevent the comparison algorithm to interpret the \BibItemOpen as an identical part of the sequence; this interpretation can lead to added and removed entries to the reference list to become mixed.
my $predefinedbibitem='\\\\(?:bibitem)\s*(?:\['.$brat_n.'\])?\s*(?:\{'. $pat_n . '\})(?:%?\s*\\\\BibitemOpen)?'; # Commands with one optional and one non-optional arguments
my $predefinedcmdoptseq='(?:'.$predefinedcmdoptseq12.'|'.$predefinedcmdoptseq01.'|'.$predefinedbibitem.')';
# standard $cmdoptseq (default: no intervening spaces, controlled by extraspace) - a final open brace is merged to the command if it exists to deal properly with multi-argument text command
my $coords= '[\-.,\s\d]*';
my $cmdoptseq='\\\\[\w\d@\*]+'.$extraspace.'(?:(?:<'.$abrat0.'>|\['.$brat_n.'\]|\{'. $pat_n . '\}|\(' . $coords .'\))'.$extraspace.')*\{?';
# inline math $....$ or \(..\)
### the commented out version is simpler but for some reason cannot cope with newline (in spite of s option) - need to include \newline explicitly
### my $math='\$(?:[^$]|\\\$)*?\$|\\\\[(].*?\\\\[)]';
my $math='\$(?:[^$]|\\\$)*?\$|\\\\[(](?:.|\n)*?\\\\[)]';
### test version (this seems to give the same results as version above)
## the current maths command cannot cope with newline within the math expression
### my $math='\$(?:[^$]|\\\$)*?\$|\\[(].*?\\[)]';
### my $math='\$(?:[^$]|\\\$)*\$';
my $backslashnl='\\\\\n';
my $oneletcmd='\\\\.\*?(?:\['.$brat_n.'\]|\{'. $pat_n . '\})*';
my $comment='%[^\n]*\n';
my $punct='[0.,\/\'\`:;\"\?\(\)\[\]!~\p{IsNonAsciiPunct}\p{IsNonAsciiS}]';
my $mathpunct='[+=<>\-\|]';
# Assembled pattern
# One token is the first matching alternative (the alternatives are tried in the order given) together with
# any white space following it; white space at the very beginning of the text is attached to the first token.
my $pat=qr/(?:\A\s*)?(?:${abbreviation}|${and}|${quotemarks}|${number}|${word}|$quotedunderscore|${defseq}|$cmdleftright|${predefinedcmdoptseq}|${cmdoptseq}|${math}|${backslashnl}|${oneletcmd}|${comment}|${punct}|${mathpunct}|\{|\})\s*/ ;
# Setup is complete: the two remaining command line arguments are the old and the new file
my ($oldfile, $newfile) = @ARGV;
# both input files have to exist (the old file is checked first)
foreach my $inputfile ( $oldfile, $newfile ) {
  die "Input file $inputfile does not exist" unless -e $inputfile;
}
# Labels identifying the two files in the difference file.
# The shorter of the two file names is padded with spaces such that the time stamps are aligned.
my ($diff,$oldlabel_n_spaces,$newlabel_n_spaces);
$diff = length($newfile) - length($oldfile);
$oldlabel_n_spaces = ( $diff > 0 ) ?  $diff : 0;
$newlabel_n_spaces = ( $diff < 0 ) ? -$diff : 0;
my ($oldtime,$newtime,$oldlabel,$newlabel);
# labels given explicitly on the command line take precedence; otherwise use file name and modification time
$oldlabel=$labels[0];
if (!defined($oldlabel)) {
  $oldtime=localtime((stat($oldfile))[9]);
  $oldlabel="$oldfile " . ( " " x $oldlabel_n_spaces ) . $oldtime;
}
$newlabel=$labels[1];
if (!defined($newlabel)) {
  $newtime=localtime((stat($newfile))[9]);
  $newlabel="$newfile " . ( " " x $newlabel_n_spaces ) . $newtime;
}
# without explicit --encoding option the encoding is guessed from the new file
if (!defined($encoding)) {
  $encoding=guess_encoding($newfile);
}
# normalise spelling of any encoding name starting with utf8
if ( $encoding =~ m/^utf8/i ) {
  $encoding = "utf8";
}
print STDERR "Encoding $encoding\n" if $verbose;
if (lc($encoding) eq "utf8" ) {
  foreach my $stream ( \*STDOUT, \*STDERR ) {
    binmode($stream, ":utf8");
  }
}
# filter($text)
# Runs $text through the script provided in $filterscript argument, if set
# If not set, just returns $text unchanged.
# If flatten was set, defer filtering to flatten. flatten will run the filter
# on all incoming text prior to its own processing.
# If flatten was not set, filter each of old and new once (see just below this def)
#
# $text is written to the standard input of the filter script (with a newline appended), and everything
# the script writes to its standard output is returned. If the script produces no output at all, the
# empty string is returned.
# Variables of the main program used: $filterscript, $ignorefilterstderr, $verbose
sub filter {
  my ($text)=@_;
  my ($textout,$pid);
  # no filter script: nothing to do
  return $text if $filterscript eq "";

  print STDERR "Passing " . length($text) . " chars to filter script " . $filterscript . "\n" if $verbose;
  if ($ignorefilterstderr) {
    # If we need to capture and bury STDERR, use the Open3 version, and close CHLD_ERR below.
    use IPC::Open3;
    # STDERR of the process is connected to CHLD_ERR, which is never read, i.e. the messages are hidden
    $pid = open3(\*CHLD_IN, \*CHLD_OUT, \*CHLD_ERR, $filterscript) or die "open3() failed $!";
  }
  else {
    # Capture STDOUT and use as our new $text. Allow STDERR to go to console.
    use IPC::Open2;
    $pid = open2(\*CHLD_OUT, \*CHLD_IN, $filterscript) or die "open2() failed $!";
  }
  # Send in $text
  # NOTE(review): the complete text is written before any output is read; for very large inputs a filter
  # which streams its output could block on a full pipe - confirm whether this matters in practice
  print CHLD_IN $text."\n"; # Adding a newline just to make sure there is one.
  close CHLD_IN;
  # Wait for output and gather it up
  # (start from the empty string such that a filter without any output yields "" rather than undef)
  $textout = "";
  while ( my $chunk = <CHLD_OUT> ) {
    $textout .= $chunk;
  }
  close CHLD_OUT;
  if ($ignorefilterstderr) {
    close CHLD_ERR; # Only opened if Open3 was used above
  }
  # On the off chance a very long running and/or frequently called script is used.
  waitpid( $pid, 0 );
  $text = $textout;
  print STDERR "Received " . length($text) . " chars after filtering\n" if $verbose;
  print STDERR $text if $verbose;
  return $text;
}
# read the complete text of both files using the selected encoding
$old=read_file_with_encoding($oldfile,$encoding);
$new=read_file_with_encoding($newfile,$encoding);
# without --flatten the filter script (if any) is applied here, once to each file
if (not defined($flatten)) {
$old=filter($old);
$new=filter($new);
}
# reset time
exetime(1);
# split each document into preamble, body and the part after \end{document}
($oldpreamble,$oldbody,$oldpost)=splitdoc($old,'\\\\begin\{document\}','\\\\end\{document\}');
($newpreamble,$newbody,$newpost)=splitdoc($new,'\\\\begin\{document\}','\\\\end\{document\}');
if ($flatten) {
$oldbody=flatten($oldbody,$oldpreamble,File::Spec->rel2abs($oldfile),$encoding);
$newbody=flatten($newbody,$newpreamble,File::Spec->rel2abs($newfile),$encoding);
# flatten preamble
$oldpreamble=flatten($oldpreamble,$oldpreamble,File::Spec->rel2abs($oldfile),$encoding);
$newpreamble=flatten($newpreamble,$newpreamble,File::Spec->rel2abs($newfile),$encoding);
}
my @auxlines;
# boolean variable: true if the ulem package is used (set further below from the latexdiff preamble and the package list)
my ($ulem)=0;
if ( length $oldpreamble && length $newpreamble ) {
# pre-process preamble by looking for commands used in \maketitle (title, author, date etc commands)
# and marking up content with latexdiff markup
@auxlines=preprocess_preamble($oldpreamble,$newpreamble);
@oldpreamble = split /\n/, $oldpreamble;
@newpreamble = split /\n/, $newpreamble;
# If a command is defined in the preamble of the new file, and only uses safe commands, then it can be considered to be safe) (contribution S. Gouezel)
# Base this assessment on the new preamble
add_safe_commands($newpreamble);
# get a list of packages from preamble if not predefined
# (i.e. packages given with the --packages option suppress the automatic detection)
%packages=list_packages($newpreamble) unless %packages;
if ( %packages && $debug ) { my $key ; foreach $key (keys %packages) { print STDERR "DEBUG \\usepackage[",$packages{$key},"]{",$key,"}\n" ;} }
}
# have to return to all processing to properly add preamble additions based on packages found
# Translate the value of --graphics-markup into one of the constants NONE, NEWONLY, BOTH.
# The value is matched case-insensitively; OFF and NEW-ONLY are accepted as alternative spellings and
# the digits 0-2 are accepted directly.
if (defined($graphicsmarkup)) {
  $graphicsmarkup=~tr/a-z/A-Z/;
  if ( $graphicsmarkup eq 'OFF' or $graphicsmarkup eq 'NONE' ) {
    $graphicsmarkup=NONE;
  } elsif ( $graphicsmarkup eq 'NEWONLY' or $graphicsmarkup eq 'NEW-ONLY' ) {
    $graphicsmarkup=NEWONLY;
  } elsif ( $graphicsmarkup eq 'BOTH' ) {
    $graphicsmarkup=BOTH;
  } elsif ( $graphicsmarkup !~ m/^[012]$/ ) {
    # name the option as it is defined in the GetOptions call and list the values which are accepted for it
    die "latexdiff Illegal value: ($graphicsmarkup) for option --graphics-markup. Possible values: NONE,NEWONLY,BOTH,0-2\n";
  }
  # else use numerical value
} else {
  # Default: no explicit setting on the command line
  # graphics are only marked up if one of the graphics packages is in use
  if ( defined $packages{"graphicx"} or defined $packages{"graphics"} ) {
    $graphicsmarkup=NEWONLY;
  } else {
    $graphicsmarkup=NONE;
  }
}
# Adaptation of the latexdiff preamble for the hyperref package
if (defined $packages{"hyperref"} ) {
# deleted lines should not generate or appear in link names:
print STDERR "hyperref package detected.\n" if $verbose ;
# rename the commands defined in the preamble to \DIFaddtex and \DIFdeltex, and append the HYPERREF preamble extension
$latexdiffpreamble =~ s/\{\\DIFadd\}/{\\DIFaddtex}/g;
$latexdiffpreamble =~ s/\{\\DIFdel\}/{\\DIFdeltex}/g;
$latexdiffpreamble .= join "\n",(extrapream("HYPERREF"),"");
# --no-links
if($nolinks){
$latexdiffpreamble .= "\n\\hypersetup{bookmarks=false}";
}
### $latexdiffpreamble .= '%DIF PREAMBLE EXTENSION ADDED BY LATEXDIFF FOR HYPERREF PACKAGE' . "\n";
### $latexdiffpreamble .= '\providecommand{\DIFadd}[1]{\texorpdfstring{\DIFaddtex{#1}}{#1}}' . "\n";
### $latexdiffpreamble .= '\providecommand{\DIFdel}[1]{\texorpdfstring{\DIFdeltex{#1}}{}}' . "\n";
### $latexdiffpreamble .= '%DIF END PREAMBLE EXTENSION ADDED BY LATEXDIFF FOR HYPERREF PACKAGE' . "\n";
}
# add commands for figure highlighting to preamble
if ($graphicsmarkup != NONE ) {
  my @matches;
  # Check if \DIFaddbeginFL definition calls \DIFaddbegin - if so we will issue an error message that graphics highlighting is
  # is not compatible with this.
  # (A more elegant solution would be to suppress the redefinitions of the \DIFaddbeginFL etc commands, but for this narrow use case
  # I currently don't see this as an efficient use of time)
  # @matches receives the body of every definition of \DIFaddbeginFL found in the preamble
  @matches=( $latexdiffpreamble =~ m/command\{\\DIFaddbeginFL}\{($pat_n)}/sg ) ;
  # we look at the last one of the list to take into account possible redefinition but almost always matches should have exactly one element
  # (the list is empty if the preamble does not define \DIFaddbeginFL at all, e.g. for a user-supplied
  # preamble file; there is nothing to check in this case)
  if ( @matches && $matches[$#matches] =~ m/\\DIFaddbegin/ ) {
    die "Cannot combine graphics markup with float styles defining \\DIFaddbeginFL in terms of \\DIFaddbegin. Use --graphics-markup=none option or choose a different float style.";
  }
  $latexdiffpreamble .= join "\n",("\\newcommand{\\DIFscaledelfig}{$SCALEDELGRAPHICS}",extrapream("HIGHLIGHTGRAPHICS"),"");
  # only change required for highlighting both is to declare \includegraphics safe, as preamble already contains commands for deleted environment
  if ( $graphicsmarkup == BOTH ) {
    init_regex_arr_list(\@SAFECMDLIST,'includegraphics');
  }
}
# $ulem is true if the latexdiff preamble loads the ulem package or if ulem is in the list of packages
$ulem = ($latexdiffpreamble =~ /\\RequirePackage(?:\[$brat_n\])?\{ulem\}/ || defined $packages{"ulem"});
# If amsmath is defined and $ulem is used for markup, redefine the \sout command to also work (mostly) in math mode
# See stack exchange https://tex.stackexchange.com/questions/20609/strikeout-in-math-mode/308647#308647 based on comment by Taylor Raine
if ( defined($packages{'amsmath'}) and $ulem ) {
$latexdiffpreamble .= join "\n",(extrapream('AMSMATHULEM'),"");
}
# If listings is being used or can be found in the latexdiff search path, add to the preamble auxiliary code to enable line-by-line markup
# (the external command kpsewhich is run to look for listings.sty if the package is not in the package list)
if ( defined($packages{"listings"}) or `kpsewhich listings.sty` ne "" ) {
my @listingpreamble=extrapream("LISTINGS");
# use the colour variant if the latexdiff preamble loads the color package
if ($latexdiffpreamble =~ /\\RequirePackage(?:\[$brat_n\])?\{color\}/ ) {
@listingpreamble=extrapream("COLORLISTINGS");
}
my @listingDIFcode=();
my $replaced;
# note that in case user supplies preamblefile the type might not reflect well the actual markup style
@listingDIFcode=extrapream("-nofail","DIFCODE_" . $type) unless defined($preamblefile);
if (!(@listingDIFcode)) {
# if listingDIFcode is empty try to guess a suitable one from the preamble
if ($latexdiffpreamble =~ /\\RequirePackage(?:\[$brat_n\])?\{color\}/ and $ulem ) {
@listingDIFcode=extrapream("DIFCODE_UNDERLINE");
} elsif ( $latexdiffpreamble =~ /\\RequirePackage(?:\[$brat_n\])?\{color\}/ ) {
# only colour used
@listingDIFcode=extrapream("DIFCODE_CFONT");
} else {
# fall-back solution
@listingDIFcode=extrapream("DIFCODE_BOLD");
}
}
# add configuration so that listings work with utf-8
push @listingpreamble, '\lstset{extendedchars=\true,inputencoding='.$encoding."}\n";
# now splice it in
# The first line containing %DIFCODE TEMPLATE is replaced by the lines of @listingDIFcode; any further
# lines containing this marker are dropped. Note that the grep block modifies the list elements in place.
$replaced=0;
###print STDERR "DEBUG: listingDIFcode: ",join("\n",@listingDIFcode),"|||\n" if $debug;
@listingpreamble=grep {
# only replace if this has not been done already (use short-circuit property of and)
if (!$replaced and $_ =~ s/^.*%DIFCODE TEMPLATE.*$/join("\n",@listingDIFcode)/e ) {
###print STDERR "DEBUG: Replaced text $_\n" if $debug;
$replaced=1;
1;
} else {
# return false for those lines matching %DIFCODE TEMPLATE (so that they are not included in output)
not m/%DIFCODE TEMPLATE/;
}
} @listingpreamble;
### print STDERR "DEBUG: listingpreamble @listingpreamble\n";
$latexdiffpreamble .= join "\n",(@listingpreamble,"");
} else {
print STDERR "WARNING: listings package not detected. Disabling mark-up in verbatim environments \n" ;
# if listings does not exist disable line-by-line markup and treat all verbatim environments as opaque
$VERBATIMENV = liststringtoregex($CONFIG{VERBATIMENV}.";".$CONFIG{VERBATIMLINEENV});
$VERBATIMLINEENV = "";
}
# adding begin and end marker lines to preamble
# ("$ latexdiffpreamble" is the variable $latexdiffpreamble; perl permits white space after the sigil)
$latexdiffpreamble = "%DIF PREAMBLE EXTENSION ADDED BY LATEXDIFF\n" . $ latexdiffpreamble . "%DIF END PREAMBLE EXTENSION ADDED BY LATEXDIFF\n";
# and return to preamble specific processing
# Three cases: both files have a preamble (preambles are differenced line by line), neither file has a
# preamble (nothing to do), or only one of them has a preamble (error).
if ( length $oldpreamble && length $newpreamble ) {
print STDERR "Differencing preamble.\n" if $verbose;
# insert dummy first line such that line count begins with line 1 (rather than perl's line 0) - just so that line numbers inserted by linediff are correct
unshift @newpreamble,'';
unshift @oldpreamble,'';
@diffpreamble = linediff(\@oldpreamble, \@newpreamble);
# remove dummy line again
shift @diffpreamble;
# add filenames, modification time and latexdiff mark
# (inserted after the first line of the preamble unless --no-label is given; the doubled comma in the list is harmless)
defined($nolabel) or splice @diffpreamble,1,0,
"%DIF LATEXDIFF DIFFERENCE FILE",
,"%DIF DEL $oldlabel",
"%DIF ADD $newlabel";
if ( @auxlines ) {
push @diffpreamble,"%DIF DELETED TITLE COMMANDS FOR MARKUP";
push @diffpreamble,join("\n",@auxlines);
}
if ( $earlylatexdiffpreamble) {
# insert latexdiff command directly after documentclass at beginning of preamble
# note that grep is only run for its side effect
# (the substitution has to succeed for exactly one line of the preamble)
( grep { s/^([^%]*\\documentclass.*)$/$1$latexdiffpreamble/ } @diffpreamble )==1 or die "Could not find documentclass statement in preamble";
} else {
# insert latexdiff commands at the end of preamble (default behaviour)
push @diffpreamble,$latexdiffpreamble;
}
push @diffpreamble,'\begin{document}';
# with --no-links the body is wrapped in a NoHyper environment (closed again when the output is assembled)
if (defined $packages{"hyperref"} && $nolinks) {
push @diffpreamble, '\begin{NoHyper}';
}
}
elsif ( !length $oldpreamble && !length $newpreamble ) {
@diffpreamble=();
} else {
print STDERR "Either both texts must have preamble or neither text must have the preamble.\n";
exit(2);
}
# Special: treat all cite commands as safe except in UNDERLINE and FONTSTRIKE mode
# (there is a conflict between citation and ulem package, see
# package documentation)
# Use post-processing
# and $packages{"apacite"}!~/natbibpapa/
# The blocks for siunitx, cleveref and glossaries below follow the same scheme: selected commands are
# registered in @MBOXCMDLIST if --enable-citation-markup is given, or if ulem is in use and
# --disable-citation-markup is not given; otherwise they are simply declared safe.
if (defined $packages{"units"} && $ulem ) {
# protect inlined maths environments by surrounding with an \mbox
# this is done to get around an incompatibility between the ulem and units package
# where spaces in the argument to underlined or crossed-out \unit commands cause an error message
print STDERR "units package detected at the same time as style using ulem.\n" if $verbose ;
$MBOXINLINEMATH=1;
}
if (defined $packages{"siunitx"} ) {
# protect SI command by surrounding them with an \mbox
# this is done to get around an incompatibility between the ulem and siunitx package
print STDERR "siunitx package detected.\n" if $verbose ;
my $mboxcmds='SI,ang,numlist,numrange,SIlist,SIrange,qty,qtylist,qtyproduct,qtyrange,complexqty';
# these commands are always declared safe
init_regex_arr_list(\@SAFECMDLIST,'num,si,numproduct,unit,complexnum');
if ( $enablecitmark || ( $ulem && ! $disablecitmark )) {
init_regex_arr_list(\@MBOXCMDLIST,$mboxcmds);
} else {
init_regex_arr_list(\@SAFECMDLIST,$mboxcmds);
}
}
if (defined $packages{"cleveref"} ) {
# protect selected command by surrounding them with an \mbox
# this is done to get around an incompatibility between ulem and cleveref package
print STDERR "cleveref package detected.\n" if $verbose ;
my $mboxcmds='[Cc]ref(?:range)?\*?,labelcref,(?:lc)?name[cC]refs?' ;
if ( $enablecitmark || ( $ulem && ! $disablecitmark )) {
init_regex_arr_list(\@MBOXCMDLIST,$mboxcmds);
} else {
init_regex_arr_list(\@SAFECMDLIST,$mboxcmds);
}
}
if (defined $packages{"glossaries"} ) {
# protect selected command by surrounding them with an \mbox
# this is done to get around an incompatibility between ulem and glossaries package
print STDERR "glossaries package detected.\n" if $verbose ;
my $mboxcmds='[gG][lL][sS](?:|pl|disp|link|first|firstplural|desc|user[iv][iv]?[iv]?),[aA][cC][rR](?:long|longpl|full|fullpl),[aA][cC][lfp]?[lfp]?';
# these commands are always declared safe
init_regex_arr_list(\@SAFECMDLIST,'[gG][lL][sS](?:(?:entry)?(?:text|plural|name|symbol)|displaynumberlist|entryfirst|entryfirstplural|entrydesc|entrydescplural|entrysymbolplural|entryuser[iv][iv]?[iv]?|entrynumberlist|entrydisplaynumberlist|entrylong|entrylongpl|entryshort|entryshortpl|entryfull|entryfullpl),[gG]lossentry(?:name|desc|symbol),[aA][cC][rR](?:short|shortpl),[aA]csp?');
if ( $enablecitmark || ( $ulem && ! $disablecitmark )) {
init_regex_arr_list(\@MBOXCMDLIST,$mboxcmds);
} else {
init_regex_arr_list(\@SAFECMDLIST,$mboxcmds);
}
}
if (defined $packages{"chemformula"} or defined $packages{"chemmacros"} ) {
print STDERR "chemformula package detected.\n" if $verbose ;
# \ch is declared safe, and additionally registered in @UNSAFEMATHCMD
init_regex_arr_list(\@SAFECMDLIST,'ch');
push(@UNSAFEMATHCMD,'ch');
# The next command would be needed to allow highlighting the interior of \ch commands in math environments
# but the redefinitions in chemformula are too deep to make this viable
# push(@MATHTEXTCMDLIST,'ch');
}
if (defined $packages{"mhchem"} ) {
print STDERR "mhchem package detected.\n" if $verbose ;
# \ce is declared safe; \ce and \cee are additionally registered in @UNSAFEMATHCMD
init_regex_arr_list(\@SAFECMDLIST,'ce');
push(@UNSAFEMATHCMD,'ce','cee');
# The next command would be needed to allow highlighting the interior of \cee commands in math environments
# but the redefinitions in chemformula are too deep to make this viable
# push(@MATHTEXTCMDLIST,'cee');
}
if ( defined $packages{"tikz-dependency"} ) {
# AMPERSAND is excluded from the safe commands for this package
init_regex_arr_ext(\@SAFECMDEXCL, 'AMPERSAND');
}
# $citpat: pattern matching the names of the citation commands; it depends on the citation package in use
my ( $citpat);
if ( defined $packages{"apacite"} ) {
  print STDERR "apacite package detected.\n" if $verbose ;
  $citpat='(?:mask)?(?:full|short|no)?cite(?:A|author|year|meta)?(?:NP)?';
} elsif (defined $packages{"biblatex"}) {
  print STDERR "biblatex package detected.\n" if $verbose ;
  $citpat='(?:[cC]ites?|(?:[pP]aren|foot|[Tt]ext|[sS]mart|super)cites?\*?|footnotecitetex)';
  push(@TEXTCMDEXCL, qr/^textcite$/);
} else {
  # citation command pattern for all other citation schemes
  $citpat='(?:cite\w*|nocite)';
}
if ( ! $ulem ) {
  # modes not using ulem: citation is safe
  push (@SAFECMDLIST, $citpat);
} else {
  ### Experimental: disable text and emph commands
  push(@SAFECMDEXCL, qr/^emph$/, qr/^text..$/);
  # replace \cite{..} by \mbox{\cite{..}} in added or deleted blocks in post-processing
  # The citation pattern is not registered for the COLOR and DVIPSCOL subtypes, nor if citation markup
  # has been disabled. The decision is taken before the list is modified: adding the pattern first and
  # removing the last element afterwards would remove an unrelated entry of @MBOXCMDLIST (e.g. the siunitx
  # or cleveref commands) in the case that --disable-citation-markup prevented the pattern from being added.
  my $colorsubtype = ( uc($subtype) eq "COLOR" or uc($subtype) eq "DVIPSCOL" );
  push(@MBOXCMDLIST,$citpat) unless ( $disablecitmark or $colorsubtype );
}
# --enable-citation-markup enforces the \mbox protection in any case
push(@MBOXCMDLIST,$citpat) if $enablecitmark ;
# with amsmath (or one of the AMS document classes) align* is used as value of MATHARRREPL
if (defined $packages{"amsmath"} or defined $packages{"amsart"} or defined $packages{"amsbook"} ) {
print STDERR "amsmath package detected.\n" if $verbose ;
$MATHARRREPL='align*';
}
# add commands in MBOXCMDLIST to SAFECMDLIST
foreach $mboxcmd ( @MBOXCMDLIST ) {
init_regex_arr_list(\@SAFECMDLIST, $mboxcmd);
}
# check if \label is in SAFECMDLIST, and if yes replace "label" in $LABELCMD by something that never matches (we hope!)
if ( iscmd("label",\@SAFECMDLIST,\@SAFECMDEXCL) ) {
$LABELCMD=~ s/label/NEVERMATCHLABEL/;
}
print STDERR "Preprocessing body. " if $verbose;
# NOTE(review): the return value of preprocess is not used, i.e. it presumably modifies its arguments in place - confirm
preprocess($oldbody,$newbody);
writedebugfile($oldbody,'old-preprocess');
writedebugfile($newbody,'new-preprocess');
# run difference algorithm
@diffbody=bodydiff($oldbody, $newbody);
$diffbo=join("",@diffbody);
writedebugfile($diffbo,"bodydiff");
print STDERR "(",exetime()," s)\n","Postprocessing body. \n" if $verbose;
# NOTE(review): the return value of postprocess is not used either - presumably $diffbo is modified in place
postprocess($diffbo);
# Assemble the output document: preamble of the difference file, body, \end{document} and the text following it in the new file
$diffall =join("\n",@diffpreamble) ;
# add visible labels
if (defined($visiblelabel)) {
# Give information right after \begin{document} (or at the beginning of the text for files without preamble
### if \date command is used, add information to \date argument, otherwise give right after \begin{document}
### $diffall=~s/(\\date$extraspace(?:\[$brat0\])?$extraspace)\{($pat_n)\}/$1\{$2 \\ LATEXDIFF comparison \\ Old: $oldlabel \\ New: $newlabel \}/ or
$diffbo = "\\begin{verbatim}LATEXDIFF comparison\nOld: $oldlabel\nNew: $newlabel\\end{verbatim}\n$diffbo" ;
}
$diffall .= "$diffbo" ;
# close the NoHyper environment opened at the end of the preamble
if (defined $packages{"hyperref"} && $nolinks) {
$diffall .= "\\end{NoHyper}\n";
}
$diffall .= "\\end{document}$newpost" if length $newpreamble ;
# for encodings other than utf8 and ascii the output is encoded explicitly and STDOUT is switched to binary mode
if ( lc($encoding) ne "utf8" && lc($encoding) ne "ascii" ) {
print STDERR "Encoding output file to $encoding\n" if $verbose;
$diffall=Encode::encode($encoding,$diffall);
binmode STDOUT;
}
print $diffall;
print STDERR "(",exetime()," s)\n","Done.\n" if $verbose;
# liststringtoregex(liststring)
# Turns a string holding a semicolon-separated list of alternatives into the text of
# a regular expression matching any one of them: "a;b" becomes "(?:(?:a)|(?:b))".
# List items that contain no non-whitespace character are ignored.
# Returns the empty string if no item is left.
sub liststringtoregex {
  my ($liststring)=@_;
  my @alternatives=();
  foreach my $item ( split(";",$liststring) ) {
    # wrap every usable item in its own non-capturing group
    push(@alternatives,"(?:$item)") if $item =~ /\S/;
  }
  return "" unless @alternatives;
  return '(?:' . join('|',@alternatives) . ')';
}
# show_configuration
# Prints the parts of the current configuration selected by the flags $showpreamble,
# $showsafe, $showtext and $showconfig to standard output.
# This routine is not encapsulated: it reads the flags, the command lists and the
# configuration variables directly from the main program. No arguments, no return value used.
sub show_configuration {
  if ($showpreamble) {
    print "\nPreamble commands:\n";
    print $latexdiffpreamble ;
  }

  # each entry: heading line, reference to the regex list printed below it
  my @sections=();
  if ($showsafe) {
    push(@sections,
         [ "\nsafecmd: Commands safe within scope of $ADDOPEN $ADDCLOSE and $DELOPEN $DELCLOSE (unless excluded):\n", \@SAFECMDLIST ],
         [ "\nsafecmd-exlude: Commands not safe within scope of $ADDOPEN $ADDCLOSE and $DELOPEN $DELCLOSE :\n", \@SAFECMDEXCL ],
         [ "\nmboxsafecmd: Commands safe only if they are surrounded by \\mbox command:\n", \@MBOXCMDLIST ],
         [ "\nnmboxsafecmd: Commands not safe:\n", \@MBOXCMDEXCL ] );
  }
  if ($showtext) {
    push(@sections,
         [ "\nCommands with last argument textual (unless excluded) and safe in every context:\n", \@TEXTCMDLIST ],
         [ "\nContext1 commands (last argument textual, command will be disabled in deleted passages, last argument will be shown as plain text):\n", \@CONTEXT1CMDLIST ],
         [ "\nContext2 commands (last argument textual, command and its argument will be disabled in deleted passages):\n", \@CONTEXT2CMDLIST ],
         [ "\nExclude list of Commands with last argument not textual (overrides patterns above):\n", \@TEXTCMDEXCL ] );
  }
  foreach my $section (@sections) {
    my ($heading,$cmdlist)=@$section;
    print $heading;
    print_regex_arr(@$cmdlist);
  }

  if ($showconfig) {
    print "Configuration variables:\n";
    # name/value pairs in output order; MATHARRREPL is deliberately not listed
    my @settings=(
      [ "ARRENV",           $ARRENV ],
      [ "COUNTERCMD",       $COUNTERCMD ],
      [ "FLOATENV",         $FLOATENV ],
      [ "ITEMCMD",          $ITEMCMD ],
      [ "LISTENV",          $LISTENV ],
      [ "MATHARRENV",       $MATHARRENV ],
      [ "MATHENV",          $MATHENV ],
      [ "MATHREPL",         $MATHREPL ],
      [ "MINWORDSBLOCK",    $MINWORDSBLOCK ],
      [ "PICTUREENV",       $PICTUREENV ],
      [ "SCALEDELGRAPHICS", $SCALEDELGRAPHICS ],
      [ "VERBATIMENV",      $VERBATIMENV ],
      [ "VERBATIMLINEENV",  $VERBATIMLINEENV ],
      [ "CUSTOMDIFCMD",     $CUSTOMDIFCMD ],
    );
    foreach my $setting (@settings) {
      my ($name,$value)=@$setting;
      print "$name=$value\n";
    }
  }
}
## guess_encoding(filename)
## Scans the beginning of filename for a call of the inputenc package of the form
##   \usepackage[<encoding>]{inputenc}
## and returns <encoding> if one is found, otherwise "utf8".
## Lines whose first non-blank character is % are skipped and not counted;
## scanning stops after 21 other lines have been inspected without a match.
## Dies if the file cannot be opened.
sub guess_encoding {
  my ($filename)=@_;
  my $enc="utf8";    # default if no inputenc call is found
  my $i=0;           # number of non-comment lines inspected so far
  # three-argument open with a lexical handle: the file name is never
  # interpreted as a mode, and the handle cannot clash with other FH users
  open (my $fh, '<', $filename) or die("Couldn't open $filename: $!");
  while ( my $line = <$fh> ) {
    next if $line =~ /^\s*%/; # skip comment lines
    if ( $line =~ m/\\usepackage\[(\w*?)\]\{inputenc\}/ ) {
      $enc=$1;
      last;
    }
    last if (++$i > 20 ); # give up once enough non-comment lines were scanned
  }
  close($fh);
  return($enc);
}
# $text=read_file_with_encoding($filename,$encoding)
# Reads the whole of $filename into one string and returns it.
#  - encoding "utf8" (any case): the file is read through the :utf8 layer
#  - encoding "ascii" (any case): the file is read as is
#  - any other encoding: the file is read as is, then decoded with Encode::decode
# On linux, CR-LF line ends are converted to LF.
# Dies if the file cannot be opened. An empty file yields the empty string.
sub read_file_with_encoding {
  my ($filename, $encoding) = @_;
  my ($output);
  my $enc=lc($encoding);

  # only utf8 is decoded while reading; everything else is read without an explicit layer
  my $mode = ( $enc eq "utf8" ) ? "<:utf8" : "<" ;
  open (my $fh, $mode, $filename) or die("Couldn't open $filename: $!");
  {
    local $/ ; # locally set record operator to undefined, ie. enable whole-file mode
    $output=<$fh>;
  }
  close $fh;

  if ( $enc ne "utf8" && $enc ne "ascii" ) {
    # Encode is only loaded when a conversion is really needed
    require Encode;
    print STDERR "Converting $filename from $encoding to utf8\n" if $verbose;
    $output=Encode::decode($encoding,$output);
  }

  if ($^O eq "linux" ) {
    $output =~ s/\r\n/\n/g ;
  }
  return $output;
}
## %packages=list_packages(@preamble)
## scans the arguments for \documentclass,\RequirePackage and \usepackage statements and constructs a hash
## whose keys are the included packages, and whose values are the associated optional arguments
#sub list_packages {
# my (@preamble)=@_;
# my %packages=();
# foreach $line ( @preamble ) {
# # get rid of comments
# $line=~s/(?rel2abs( $filename ) ;
($ignore, $dirname, $fileonly) = File::Spec->splitpath($filename) ;
$bblfile = $filename;
$bblfile=~s/\.tex$//;
$bblfile.=".bbl";
if ( ($includeonly) = ($preamble =~ m/\\includeonly\{(.*?)\}/ ) ) {
$includeonly =~ s/,/|/g;
} else {
$includeonly = '.*?';
}
print STDERR "DEBUG: includeonly $includeonly\n" if $debug;
# Run through filter, to let filterscript have a pass if it was set
$text = filter($text);
# Recursively replace \\import, \\subimport, and related import commands
$text =~ s/(^(?:[^%\n]|\\%)*)(\\(sub)?(?:import|inputfrom|includefrom))\{(.*?)\}(?:[\s]*)\{(.*?)\}/{
# (--------1-------)(--(=3=)-------------2-------------------) (-4-) (-5-)
# $1 is begline
# $2 is the import macro name
# $3 is (optional) prefix "sub"
# $4 is directory
# $5 is filename
$begline = (defined($1)? $1 : "");
$subdir = $4;
$fname = $5;
$fname .= ".tex" unless $fname =~ m|\.\w{3,4}$|;
print STDERR "DEBUG begline:", $begline, "\n" if $debug;
print STDERR "DEBUG", (defined($3)? "subimport_file:" : "import_file:"), $subdir, "\n" if $debug;
print STDERR "DEBUG file:", $fname, "\n" if $debug;
# subimport appends $subdir to the current $dirname. import replaces it with an absolute path.
$subdirfull = (defined($3) ? File::Spec->catdir($dirname,$subdir) : $subdir);
$importfilepath = File::Spec->catfile($subdirfull, $fname);
print STDERR "importing importfilepath:", $importfilepath,"\n" if $verbose;
if ( -f $importfilepath ) {
# If file exists, replace input or include command with expanded input
#TODO: need remove_endinput & newpage similar to other replacements inside flatten
$replacement=flatten(read_file_with_encoding($importfilepath, $encoding), $preamble,$importfilepath,$encoding);
} else {
# if file does not exist, do not expand include or input command (do not warn if fname contains #[0-9] as it is then likely part of a command definition
# and is not meant to be expanded directly
print STDERR "WARNING: Could not find included file ",$importfilepath,". I will continue but not expand |$2|\n";
$replacement = $2;
$replacement .= "{$subdir}{$fname} % Processed";
}
"$begline$replacement";
}/exgm;
# recursively replace \\input and \\include files
$text =~ s/(^(?:[^%\n]|\\%)*)(\\input\{(.*?)\}|\\include\{(${includeonly}(?:\.tex)?)\})/{
$begline=(defined($1)? $1 : "") ;
$inputcmd=$2;
$fname = $3 if defined($3) ;
$fname = $4 if defined($4) ;
$newpage=(defined($4)? " \\newpage " : "") ;
# # add tex extension unless there is a three or four letter extension already
$fname .= ".tex" unless $fname =~ m|\.\w{3,4}$|;
$fullfile = File::Spec->catfile($dirname,$fname);
print STDERR "DEBUG Beg of line match |$1|\n" if defined($1) && $debug ;
print STDERR "Include file $fname\n" if $verbose;
print STDERR "DEBUG looking for file ",$fullfile, "\n" if $debug;
# content of file becomes replacement value (use recursion), add \newpage if the command was include
if ( -f $fullfile ) {
# If file exists, replace input or include command with expanded input
$replacement=flatten(read_file_with_encoding($fullfile, $encoding), $preamble,$filename,$encoding);
$replacement = remove_endinput($replacement);
# \include always starts a new page; use explicit \newpage command to simulate this
} else {
# if file does not exist, do not expand include or input command (do not warn if fname contains #[0-9] as it is then likely part of a command definition
# and is not meant to be expanded directly
print STDERR "WARNING: Could not find included file ",$fullfile,". I will continue but not expand |$inputcmd|\n" unless $fname =~ m(#[0-9]) ;
$replacement = $inputcmd ; # i.e. just the original command again -> make no change file does not exist
$newpage="";
}
"$begline$newpage$replacement$newpage";
}/exgm;
# replace bibliography with bbl file if it exists
$text=~s/(^(?:[^%\n]|\\%)*)\\bibliography\{(.*?)\}/{
if ( -f $bblfile ){
$replacement=read_file_with_encoding(File::Spec->catfile($bblfile), $encoding);
} else {
warn "Bibliography file $bblfile cannot be found. No flattening of \\bibliography done. Run bibtex on old and new files first";
$replacement="\\bibliography{$2}";
}
$begline=(defined($1)? $1 : "") ;
"$begline$replacement";
}/exgm;
# replace subfile with contents (subfile package)
$text=~s/(^(?:[^%\n]|\\%)*)\\subfile\{(.*?)\}/{
$begline=(defined($1)? $1 : "") ;
$fname = $2;
# # add tex extension unless there is a three or four letter extension already
$fname .= ".tex" unless $fname =~ m|\.\w{3,4}|;
print STDERR "Include file as subfile $fname\n" if $verbose;
# content of file becomes replacement value (use recursion)
# now strip away everything outside and including \begin{document} and \end{document} pair#
# # note: no checking for comments is made
$fullfile=File::Spec->catfile($dirname,$fname);
if ( -f $fullfile) {
# if file exists, expand \subfile command by contents of file
$subfile=read_file_with_encoding($fullfile,$encoding) or die "Could not open included subfile ",$fullfile,": $!";
($subpreamble,$subbody,$subpost)=splitdoc($subfile,'\\\\begin\{document\}','\\\\end\{document\}');
### $subfile=~s|^.*\\begin{document}||s;
### $subfile=~s|\\end{document}.*$||s;
$replacement=flatten($subbody, $preamble,$fullfile,$encoding);
### $replacement = remove_endinput($replacement);
} else {
# if file does not exist, do not expand subfile
print STDERR "WARNING: Could not find subfile ",$fullfile,". I will continue but not expand |$2|\n" unless $fname =~ m(#[0-9]) ;
$replacement = "\\subfile\{$2\}" ; # i.e. just the original command again -> make no change file does not exist
}
"$begline$replacement";
}/exgm;
# replace \verbatiminput and \lstlistinginput
$text=~s/(^(?:[^%\n]|\\%)*)\\(verbatiminput\*?|lstinputlisting)$extraspace(\[$brat_n\])?$extraspace\{(.*?)\}/{
$begline=(defined($1)? $1 : "") ;
$command = $2 ;
$fname = $4 ;
$verboptions = defined($3)? $3 : "" ;
if ($command eq 'verbatiminput' ) {
$verbenv = "verbatim" ;
} elsif ($command eq 'verbatiminput*' ) {
$verbenv = "verbatim*" ;
} elsif ($command eq 'lstinputlisting' ) {
$verbenv = "lstlisting" ;
} else {
die "Internal errorL Unexpected verbatim input type $command.\n";
}
print STDERR "DEBUG Beg of line match |$begline|\n" if $debug ;
print STDERR "Include file $fname verbatim\n" if $verbose;
print STDERR "DEBUG looking for file ",File::Spec->catfile($dirname,$fname), "\n" if $debug;
# content of file becomes replacement value (do not use recursion), add \newpage if the command was include
###$replacement=read_file_with_encoding(File::Spec->catfile($dirname,$fname), $encoding) or die "Couldn't find file ",File::Spec->catfile($dirname,$fname),": $!";
$replacement=read_file_with_encoding(File::Spec->catfile($dirname,$fname), $encoding); # (cannot on apparent failure as this triggers for empty fie. Original: or die "Couldn't find file ",File::Spec->catfile($dirname,$fname),": $!";
# Add a new line if it not already there (note that the matching operator needs to use different delimiters, as we are still inside an outer scope that takes precedence
$replacement .= "\n" unless $replacement =~ m(\n$) ;
"$begline\\begin{$verbenv}$verboptions\n$replacement\\end{$verbenv}\n";
}/exgm;
return($text);
}
# print_regex_arr(@arr)
# Prints the elements of a regex array on one line on stdout, separated by spaces.
# For elements which are compiled regular expressions of the form qr/^cmd$/ the
# wrapper added by perl on stringification and the ^ and $ anchors are removed,
# so that only cmd is shown. Other elements are printed unchanged.
sub print_regex_arr {
  my $dumstring = join(" ",@_);
  # Stringification of qr/^ref$/ depends on the perl version:
  #   (?-xism:^ref$)   before perl 5.14
  #   (?^:^ref$)       from perl 5.14, possibly with flags after the caret, e.g. (?^u:^ref$)
  # Both forms are stripped (the original pattern only knew the first one).
  $dumstring =~ s/\(\?(?:\^[a-z]*|[a-z]*-[a-z]*):\^(.*?)\$\)/$1/g;
  print $dumstring,"\n";
}
# @lines=extrapream($type,...)
# Collects preamble lines for every $type given and returns them as one list,
# each line (without line end) tagged with a trailing " %DIF PREAMBLE".
#  - If $type is the name of an existing file (or /dev/null), the lines are read from
#    that file, using the encoding in the global $encoding if it is defined and the
#    locale encoding otherwise. Lines already containing %DIF PREAMBLE are not tagged again.
#  - Otherwise $type is upcased and the appendix of this script (DATA handle, i.e.
#    the part after the __END__ token) is searched for the line "%DIF $TYPE";
#    everything from that line up to (excluding) the line "%DIF END $TYPE" is copied.
# extrapream('-nofail',$type) will---instead of failing---simply return nothing if
# it does not find the matching line in appendix (do not use -nofail option with multiple types!)
# Without -nofail an unknown type terminates the program with exit code 2.
sub extrapream {
  my @types=@_;
  my $nofail=0;
  ###my @retval=("%DIF PREAMBLE EXTENSION ADDED BY LATEXDIFF")  ;
  my @retval=();

  foreach my $arg ( @types ) {
    if ( $arg eq '-nofail' ) {
      $nofail=1;
      next;
    }
    my $type=$arg;
    my $copy=0;
    if ( -f $type || lc($type) eq '/dev/null' ) {
      print STDERR "Reading preamble file $type\n" if $verbose ;
      # lexical handle, closed again after reading (the handle used to stay open)
      open (my $fh, '<', $type) or die "Cannot open preamble file $type: $!";
      if (defined($encoding)) {
        binmode($fh,":encoding($encoding)");
      } else {
        require Encode::Locale;
        binmode($fh,":encoding(locale)");
      }
      while ( my $line = <$fh> ) {
        chomp $line;
        if ( $line =~ m/%DIF PREAMBLE/ ) {
          push (@retval,"$line");
        } else {
          push (@retval,"$line %DIF PREAMBLE");
        }
      }
      close($fh);
    } else {    # not (-f $type)
      $type=uc($type);   # upcase argument
      print STDERR "Preamble Internal Type $type\n" if $verbose;
      # save filehandle position (before first read this points to line after __END__)
      # but seek DATA,0,0 resets it to the beginning of the file
      # see https://stackoverflow.com/questions/4459601/how-can-i-use-data-twice
      my $data_start = tell DATA;
      while ( my $line = <DATA> ) {
        if ( $line =~ m/^%DIF $type/ ) {
          $copy=1;
        } elsif ( $line =~ m/^%DIF END $type/ ) {
          last;
        }
        chomp $line;
        push (@retval,"$line %DIF PREAMBLE") if $copy;
      }
      if ( $copy == 0 ) {
        unless ($nofail) {
          print STDERR "\nPreamble style $type not implemented.\n";
          print STDERR "Write latexdiff -h to get help with available styles\n";
          exit(2);
        }
      }
      seek DATA,$data_start,0;   # rewind DATA handle to beginning of data record
    }
  }
  ###push (@retval,"%DIF END PREAMBLE EXTENSION ADDED BY LATEXDIFF")  ;
  return @retval;
}
# ($part1,$part2,$part3)=splitdoc($text,$word1,$word2)
# Cuts $text at the regular expressions $word1 and $word2:
#   $part1 is everything before $word1,
#   $part2 is the text between $word1 and $word2,
#   $part3 is everything after $word2.
# The delimiters themselves are not part of any of the three parts.
# If $word1 is not found, $part1 and $part3 are empty and $part2 is $text;
# finding $word2 without $word1, or $word1 without a following $word2, is a fatal error.
# Offsets are taken from @- and @+ rather than from $` and $'.
sub splitdoc {
  my ($text,$word1,$word2)=@_;

  if ( $text =~ m/(^[^%]*)($word1)/m ) {
    # offsets of the first delimiter (second capture group)
    my ($from,$to)=($-[2],$+[2]);
    my $before=substr($text,0,$from);
    my $remainder=substr($text,$to);
    if ( $remainder =~ m/(^[^%]*)($word2)/m ) {
      ($from,$to)=($-[2],$+[2]);
      return ($before,substr($remainder,0,$from),substr($remainder,$to));
    }
    die "$word1 and $word2 not in the correct order or not present as a pair." ;
  }

  # no opening delimiter: the closing delimiter must not be present either
  die "$word2 present but not $word1." if ( $text =~ m/(^[^%]*)$word2/ms );
  return ("",$text,"");
}
# @diffwords=bodydiff($old,$new)
# Computes the marked-up difference of two document bodies given as strings:
#   1. both bodies are split into latex tokens (splitlatex),
#   2. pass1 expands text commands and merges small unchanged blocks,
#   3. pass2 inserts the DIF tokens and mark-up.
# Returns the list of tokens produced by pass2.
# In debug mode the token lists before and after pass1 are written to the files
# latexdiff.debug.tokenold/tokennew and latexdiff.debug.tokenold2/tokennew2.
sub bodydiff {
  my ($oldwords, $newwords) = @_;

  # writes a token list to a debug file, tokens separated by "***\n"
  my $dumptokens = sub {
    my ($dumpfile,$tokens)=@_;
    open(my $dumpfh,">",$dumpfile);
    print $dumpfh join("***\n",@$tokens);
    close($dumpfh);
  };

  print STDERR "(",exetime()," s)\n","Splitting into latex tokens \n" if $verbose;
  print STDERR "Parsing $oldfile \n" if $verbose;
  my @oldwords = splitlatex($oldwords);
  print STDERR "Parsing $newfile \n" if $verbose;
  my @newwords = splitlatex($newwords);
  if ( $debug ) {
    $dumptokens->("latexdiff.debug.tokenold",\@oldwords);
    $dumptokens->("latexdiff.debug.tokennew",\@newwords);
  }

  print STDERR "(",exetime()," s)\n","Pass 1: Expanding text commands and merging isolated identities with changed blocks " if $verbose;
  pass1(\@oldwords, \@newwords);

  print STDERR "(",exetime()," s)\n","Pass 2: inserting DIF tokens and mark up. " if $verbose;
  if ( $debug ) {
    $dumptokens->("latexdiff.debug.tokenold2",\@oldwords);
    $dumptokens->("latexdiff.debug.tokennew2",\@newwords);
  }

  my @marked=pass2(\@oldwords, \@newwords);
  return(@marked);
}
# @words=splitlatex($string)
# split string according to latex rules
# Each element of words is either
# a word (including trailing spaces and punctuation)
# a latex command
# if there is white space in the beginning return that as first token
# The tokens are the successive matches of the global pattern $pat. If these matches
# do not cover the whole string, a warning is printed (unless $ignorewarnings is set)
# and the string is parsed again, inserting the skipped characters as extra tokens.
# Returns the empty list for an empty string.
sub splitlatex {
my ($inputstring) = @_ ;
my $string=$inputstring ;
# if input is empty, return empty list
length($string)>0 or return ();
# strip leading white space and remember it; it is put back as first token at the end
$string=~s/^(\s*)//s;
my $leadin=$1;
# input consisted of white space only
length($string)>0 or return ($leadin);
# fast path: collect all matches of the token pattern in one go
my @retval=($string =~ m/$pat/osg);
# the tokens joined together must reproduce the string, otherwise some characters were skipped
if (length($string) != length(join("",@retval))) {
print STDERR "\nWARNING: Inconsistency in length of input string and parsed string:\n This often indicates faulty or non-standard latex code.\n In many cases you can ignore this and the following warning messages.\n Note that character numbers in the following are counted beginning after \\begin{document} and are only approximate." unless $ignorewarnings;
print STDERR "DEBUG Original length ",length($string)," Parsed length ",length(join("",@retval)),"\n" if $debug;
print STDERR "DEBUG Input string: |$string|\n" if (length($string)<500) && $debug;
print STDERR "DEBUG Token parsing: |",join("+",@retval),"|\n" if (length($string)<500) && $debug ;
@retval=();
# slow way only do this if other m//sg method fails
# $last is the position in $string just after the previous match
my $last = 0;
while ( $string =~ m/$pat/osg ) {
my $match=$&;
# the current match does not start where the previous one ended: characters were skipped
if ($last + length $& != pos $string ) {
my $pos=pos($string);
# $dum: skipped characters and current match with up to 30 characters of context before them
my $offset=30<$last ? 30 : $last;
my $dum=substr($string,$last-$offset,$pos-$last+2*$offset);
my $dum1=$dum;
my $cnt=$#retval;
my $i;
# show the context on one line
$dum1 =~ s/\n/ /g;
unless ($ignorewarnings) {
print STDERR "\n$dum1\n";
print STDERR " " x 30,"^" x ($pos-$last)," " x 30,"\n";
print STDERR "Missing characters near word " . (scalar @retval) . " character index: " . $last . "-" . pos($string) . " Length: " . length($match) . " Match: |$match| (expected match marked above).\n";
}
# put in missing characters `by hand'
push (@retval, substr($dum,$offset,$pos-$last-length($match)));
# Note: there seems to be a bug in substr with utf8 that made the following line output substr which were too long,
# using dum instead appears to work
# push (@retval, substr($string,$last, pos($string)-$last-length($match)));
}
push (@retval, $match);
$last=pos $string;
}
}
# leading white space (if any) becomes the first token
unshift(@retval,$leadin) if (length($leadin)>0);
return @retval;
}
# pass1( \@seq1,\@seq2)
# Look for differences between seq1 and seq2.
# Where an common-subsequence block is flanked by deleted or appended blocks,
# and is shorter than $MINWORDSBLOCK words it is appended
# to the last deleted or appended word. If the block contains tokens other than words
# or punctuation it is not merged.
# Deleted or appended block consisting of words and safe commands only are
# also merged, to prevent break-up in pass2 (after previous isolated words have been removed)
# If there are commands with textual arguments (e.g. \caption) both in corresponding
# appended and deleted blocks split them such that the command and opening bracket
# are one token, then the rest is split up following standard rules, and the closing
# bracket is a separate token, ie. turn
# "\caption{This is a textual argument}" into
# ("\caption{","This ","is ","a ","textual ","argument","}")
# No return value. Destructively changes sequences
#
# Implementation: the sequences are traversed once with traverse_sequences; the
# callbacks only record instructions in @$todo, which are carried out after the traversal.
# Each instruction has the form [ index in seq1 or -1, index in seq2 or -1, cnt, tokens... ].
sub pass1 {
my $seq1 = shift ;
my $seq2 = shift ;
# original lengths, needed to address elements relative to the end later on
my $len1 = scalar @$seq1;
my $len2 = scalar @$seq2;
# pattern for ordinary words: letters and punctuation followed by optional white space or ~
my $wpat=qr/^(?:[a-zA-Z.,'`:;?()!]*)[\s~]*$/; #'
# index of the last discarded (seq1) and last added (seq2) token, -1 if there is none
my ($last1,$last2)=(-1,-1) ;
# $cnt and $block: length and tokens of the unchanged block currently being collected
my $cnt=0;
my $block=[];
# tokens of the current added and deleted block, stored as [ token, index ]
my $addblock=[];
my $delblock=[];
# list of merge and split instructions
my $todo=[];
my $instruction=[];
my $i;
my (@delmid,@addmid,@dummy);
my ($addcmds,$delcmds,$matchindex);
my ($addtextblocks,$deltextblocks);
# token and block counters, only used for the statistics printed in verbose mode
my ($addtokcnt,$deltokcnt,$mattokcnt)=(0,0,0);
my ($addblkcnt,$delblkcnt,$matblkcnt)=(0,0,0);
# $adddiscard: common part of the callbacks for discarded and added tokens. If the token
# follows an unchanged block, decide whether that block is to be merged, then reset the block state.
my $adddiscard = sub {
if ($cnt > 0 ) {
$matblkcnt++;
# just after an unchanged block
# print STDERR "Unchanged block $cnt, $last1,$last2 \n";
# merge only if the block is short and every token in it is either an ordinary word or
# a safe command whose arguments split into fewer than 3 white-space separated pieces
if ($cnt < $MINWORDSBLOCK
&& $cnt==scalar (
grep { /^$wpat/ || ( /^\\((?:[`'^"~=.]|[\w\d@*]+))((?:\[$brat_n\]|\{$pat_n\})*)/o
&& iscmd($1,\@SAFECMDLIST,\@SAFECMDEXCL)
&& scalar(@dummy=split(" ",$2))<3 ) }
@$block) ) {
# merge identical blocks shorter than $MINWORDSBLOCK
# and only containing ordinary words
# with preceding different word
# We cannot carry out this merging immediately as this
# would change the index numbers of seq1 and seq2 and confuse
# the algorithm, instead we store in @$todo where we have to merge
push(@$todo, [ $last1,$last2,$cnt,@$block ]);
}
$block = [];
$cnt=0; $last1=-1; $last2=-1;
}
};
# callback for tokens only present in seq1; $_[0] is the index into seq1
my $discard=sub { $deltokcnt++;
&$adddiscard; #($_[0],$_[1]);
push(@$delblock,[ $seq1->[$_[0]],$_[0] ]);
$last1=$_[0] };
# callback for tokens only present in seq2; $_[1] is the index into seq2
my $add = sub { $addtokcnt++;
&$adddiscard; #($_[0],$_[1]);
push(@$addblock,[ $seq2->[$_[1]],$_[1] ]);
$last2=$_[1] };
# callback for tokens present in both sequences
my $match = sub { $mattokcnt++;
if ($cnt==0) { # first word of matching sequence after changed sequence or at beginning of word sequence
# the changed blocks just completed are analysed here
$deltextblocks = extracttextblocks($delblock);
$delblkcnt++ if scalar @$delblock;
$addtextblocks = extracttextblocks($addblock);
$addblkcnt++ if scalar @$addblock;
# make a list of all TEXTCMDLIST commands in deleted and added blocks
$delcmds = extractcommands($delblock);
$addcmds = extractcommands($addblock);
# now find those text commands, which are found in both deleted and added blocks, and expand them
# keygen(third argument of _longestCommonSubsequence) implies to sort on command (0th elements of $addcmd elements)
# the calling format for longestCommonSubsequence has changed between versions of
# Algorithm::Diff so we need to check which one we are using
if ( $algodiffversion > 1.15 ) {
### Algorithm::Diff 1.19
$matchindex=Algorithm::Diff::_longestCommonSubsequence($delcmds,$addcmds, 0, sub { $_[0]->[0] } );
} else {
### Algorithm::Diff 1.15
$matchindex=Algorithm::Diff::_longestCommonSubsequence($delcmds,$addcmds, sub { $_[0]->[0] } );
}
# $matchindex->[$i], if defined, is the index in $addcmds of the command paired with $delcmds->[$i]
for ($i=0 ; $i<=$#$matchindex ; $i++) {
if (defined($matchindex->[$i])){
$j=$matchindex->[$i];
# split the textual argument (element 3) of both commands into tokens
@delmid=splitlatex($delcmds->[$i][3]);
@addmid=splitlatex($addcmds->[$j][3]);
# text blocks starting before the command are queued first, keeping the instructions ordered by index
while (scalar(@$deltextblocks) && $deltextblocks->[0][0]<$delcmds->[$i][1]) {
my ($index,$block,$cnt)=@{ shift(@$deltextblocks) };
push(@$todo, [$index,-1,$cnt,@$block]);
}
# split instruction (cnt==-1): command with opening bracket, argument tokens, closing bracket
push(@$todo, [ $delcmds->[$i][1],-1,-1,$delcmds->[$i][2],@delmid,$delcmds->[$i][4]]);
while (scalar(@$addtextblocks) && $addtextblocks->[0][0]<$addcmds->[$j][1]) {
my ($index,$block,$cnt)=@{ shift(@$addtextblocks) };
push(@$todo, [-1,$index,$cnt,@$block]);
}
push(@$todo, [ -1,$addcmds->[$j][1],-1,$addcmds->[$j][2],@addmid,$addcmds->[$j][4]]);
}
}
# mop up remaining textblocks
while (scalar(@$deltextblocks)) {
my ($index,$block,$cnt)=@{ shift(@$deltextblocks) } ;
push(@$todo, [$index,-1,$cnt,@$block]);
}
while (scalar(@$addtextblocks)) {
my ($index,$block,$cnt)=@{ shift(@$addtextblocks) };
push(@$todo, [-1,$index,$cnt,@$block]);
}
$addblock=[];
$delblock=[];
}
# collect the unchanged token ($_[1] is its index in seq2)
push(@$block,$seq2->[$_[1]]);
$cnt++ };
# tokens are compared after removing leading/trailing white space and collapsing runs of white space
my $keyfunc = sub { join(" ",split(" ",shift())) };
traverse_sequences($seq1,$seq2, { MATCH=>$match, DISCARD_A=>$discard, DISCARD_B=>$add }, $keyfunc );
# now carry out the merging/splitting. Refer to elements relative from
# the end (with negative indices) as these offsets don't change before the instruction is executed
# cnt>0: merged small unchanged groups with previous changed blocks
# cnt==-1: split textual commands into components
foreach $instruction ( @$todo) {
($last1,$last2,$cnt,@$block)=@$instruction ;
if ($cnt>=0) {
# merge: replace the token at the index and the $cnt tokens following it by their concatenation
splice(@$seq1,$last1-$len1,1+$cnt,join("",$seq1->[$last1-$len1],@$block)) if $last1>=0;
splice(@$seq2,$last2-$len2,1+$cnt,join("",$seq2->[$last2-$len2],@$block)) if $last2>=0;
} else {
# split: replace the single command token by its components
splice(@$seq1,$last1-$len1,1,@$block) if $last1>=0;
splice(@$seq2,$last2-$len2,1,@$block) if $last2>=0;
}
}
if ($verbose) {
print STDERR "\n";
print STDERR " $mattokcnt matching tokens in $matblkcnt blocks.\n";
print STDERR " $deltokcnt discarded tokens in $delblkcnt blocks.\n";
print STDERR " $addtokcnt appended tokens in $addblkcnt blocks.\n";
}
}
# extracttextblocks(\@blockindex)
# Finds runs of consecutive text-like tokens in a changed block.
# $blockindex has the following format
#   [ [ token1, index1 ], [token2, index2],.. ]
# where index refers to the index in the original old or new word sequence.
# A token is text-like if it is an ordinary word (letters and punctuation with trailing
# white space), or a command which is a safe command but not a text command.
# Returns: reference to an array of the form
#   [[ $index, $textblock, $cnt ], ..
# where $index     index of the first token of the run (the token the others are merged with)
#       $textblock reference to the list of the remaining tokens of the run (first token not included)
#       $cnt       number of tokens in $textblock
# Runs consisting of a single token are not reported.
#
# requires: iscmd
#
sub extracttextblocks {
  my $block=shift;
  my $wpat=qr/^(?:[a-zA-Z.,'`:;?()!]*)[\s~]*$/; #'
  # local definition of $extraspace (shadowing the global one), so that commands with
  # white space between their arguments are recognised no matter what the global setting is
  my $extraspace='\s*';

  my @merges=();     # result list
  my @pending=();    # tokens of the current run, without its first token
  my $anchor=-1;     # index of the first token of the current run, -1 outside a run

  foreach my $entry (@$block) {
    my ($token,$index)=@$entry;

    my $textlike = ( $token =~ /$wpat/ );
    if ( !$textlike && $token =~/^\\((?:[`'^"~=.]|[\w\d@\*]+))((?:${extraspace}\[$brat_n\]${extraspace}|${extraspace}\{$pat_n\})*)/ ) {
      my $cmdname=$1;
      # a command counts as text if it is safe and its last argument is not textual
      $textlike = iscmd($cmdname,\@SAFECMDLIST,\@SAFECMDEXCL)
                  && !iscmd($cmdname,\@TEXTCMDLIST,\@TEXTCMDEXCL);
    }

    if ($textlike) {
      if ($anchor<0) {
        # first token of a new run: only its index is remembered
        $anchor=$index;
      } else {
        push(@pending,$token);
      }
      next;
    }

    # any other token ends the current run
    push(@merges,[ $anchor, [ @pending ], scalar(@pending) ]) if scalar(@pending);
    @pending=();
    $anchor=-1;
  }
  # a run may extend to the end of the block
  push(@merges,[ $anchor, [ @pending ], scalar(@pending) ]) if scalar(@pending);

  return(\@merges);
}
# extractcommands( \@blockindex )
# Picks the text commands (commands in TEXTCMDLIST and not in TEXTCMDEXCL) out of a changed block.
# $blockindex has the following format
#   [ [ token1, index1 ], [token2, index2],.. ]
# where index refers to the index in the original old or new word sequence.
# Returns: reference to an array of the form
#   [ [ "cmd1", index, "\cmd1[optarg]{arg1}{", "arg2" ,"\RIGHTBRACE " ],..
# i.e. for every command: its name without backslash, the index taken from the input
# array, the command up to and including the opening brace of the last argument,
# the last argument, and the closing part in which the brace is replaced by \RIGHTBRACE.
# Only tokens consisting of a command whose last argument is given in braces are considered.
#
# requires: iscmd
#
sub extractcommands {
  my $block=shift;
  # local definition of $extraspace (shadowing the global one), so that commands with
  # white space between their arguments are recognised no matter what the global setting is
  my $extraspace='\s*';
  my @commands=();

  foreach my $entry (@$block) {
    my ($token,$index)=@$entry;
    # \cmd[...]{...}{last argument}
    # capture groups: 1: \cmd[...]{...}{    2: cmd    3: last argument    4: } + trailing spaces
    next unless $token =~ m/^(\\([\w\d\*]+)(?:${extraspace}\[$brat_n\]|${extraspace}\{$pat_n\})*${extraspace}\{)($pat_n)(\}\s*)$/so;
    my ($open,$cmd,$mid,$closing)=($1,$2,$3,$4);
    next unless iscmd($cmd,\@TEXTCMDLIST,\@TEXTCMDEXCL);

    print STDERR "DEBUG EXTRACTCOMMANDS Match |$open|$cmd|$mid|$closing|$index \n" if $debug;
    $closing =~ s/\}/\\RIGHTBRACE/ ;
    push(@commands,[ $cmd,$index,$open,$mid,$closing ]);
  }
  return \@commands;
}
# iscmd($cmd,\@regexarray,\@regexexcl)
# Tests a command name against an include list and an exclude list of patterns.
# Every pattern has to match the complete name.
# Returns 1 if $cmd matches at least one pattern in @regexarray and
# no pattern in @regexexcl, otherwise returns 0.
sub iscmd {
  my ($cmd,$regexar,$regexexcl)=@_;

  # the command has to be on the include list ...
  my $listed=0;
  foreach my $include ( @$regexar ) {
    next unless $cmd =~ m/^${include}$/;
    $listed=1;
    last;
  }
  return 0 unless $listed;

  # ... and must not be on the exclude list
  foreach my $exclude ( @$regexexcl ) {
    return 0 if ( $cmd =~ m/^${exclude}$/ );
  }
  return 1;
}
# pass2( \@seq1,\@seq2)
# Look for differences between seq1 and seq2 and return the marked-up token list.
# Tokens only in seq1 (deleted) and tokens only in seq2 (added) are collected in hunks;
# when the next common token is reached (or at the end of the sequences) the hunks are
# passed to marktags, first the deleted hunk with the $DEL... tags, then the added hunk
# with the $ADD... tags, and the result is appended to the output.
# Common tokens are copied from seq2.
# In verbose mode token and block statistics are printed to STDERR
# (hunks still open at the end of the sequences are not included in the block counts).
sub pass2 {
  my ($seq1,$seq2)=@_;

  my ($nadded,$ndeleted,$nmatched)=(0,0,0);
  my ($naddblocks,$ndelblocks)=(0,0);

  my @marked=();     # output tokens
  my $delhunk=[];    # deleted tokens not yet marked up
  my $addhunk=[];    # added tokens not yet marked up

  # mark up the current deleted hunk: changes are marked, commands are commented out
  my $markdeleted = sub {
    return marktags($DELMARKOPEN,$DELMARKCLOSE,$DELOPEN,$DELCLOSE,$DELCMDOPEN,$DELCMDCLOSE,$DELCOMMENT,$delhunk);
  };
  # mark up the current added hunk: changes are marked, commands are simply quoted
  my $markadded = sub {
    return marktags($ADDMARKOPEN,$ADDMARKCLOSE,$ADDOPEN,$ADDCLOSE,"","",$ADDCOMMENT,$addhunk);
  };

  my %callbacks=(
    DISCARD_A => sub {
      $ndeleted++;
      push(@$delhunk,$seq1->[$_[0]]);
    },
    DISCARD_B => sub {
      $nadded++;
      push(@$addhunk,$seq2->[$_[1]]);
    },
    MATCH => sub {
      $nmatched++;
      if ( scalar @$delhunk ) {
        $ndelblocks++;
        push(@marked,$markdeleted->());
        $delhunk=[];
      }
      if ( scalar @$addhunk ) {
        $naddblocks++;
        push(@marked,$markadded->());
        $addhunk=[];
      }
      push(@marked,$seq2->[$_[1]]);
    },
  );
  # tokens are compared with white space normalised
  my $normalise = sub { join(" ",split(" ",shift())) };
  traverse_sequences($seq1,$seq2,\%callbacks,$normalise);

  # clear up unprocessed hunks
  push(@marked,$markdeleted->()) if scalar @$delhunk;
  push(@marked,$markadded->()) if scalar @$addhunk;

  if ($verbose) {
    print STDERR "\n";
    print STDERR " $nmatched matching tokens. \n";
    print STDERR " $ndeleted discarded tokens in $ndelblocks blocks.\n";
    print STDERR " $nadded appended tokens in $naddblocks blocks.\n";
  }
  return(@marked);
}
# marktags($openmark,$closemark,$open,$close,$opencmd,$closecmd,$comment,\@block)
# Marks up the tokens of one changed hunk and returns them as a flat list.
# returns ($openmark,$open,$block,$close,$closemark) if @block contains no commands (except white-listed ones),
# braces, ampersands, or comments
# mark comments with $comment
# exclude all other exceptions from scope of open, close like this
# ($openmark, $open,...,$close, $opencmd,command, command,$closecmd, $open, ..., $close, $closemark)
# If $opencmd begins with "%" marktags assumes it is operating on a deleted block, otherwise on an added block
# Notes:
# - the array behind \@block is overwritten in place with the result of re-splitting its joined content (splitlatex)
# - $openmark and $closemark are only emitted if the block contains at least one token that is neither a comment
#   nor pure white-space
# - the function calls itself recursively to mark up the last argument of text commands
sub marktags {
my ($openmark,$closemark,$open,$close,$opencmd,$closecmd,$comment,$block)=@_;
my $word;
my (@argtext);
my $retval=[];
# becomes 1 as soon as the first token that is neither comment nor white-space has been seen
my $noncomment=0;
my $cmd=-1; # -1 at beginning 0: last token written is an ordinary word
# 1: last token written is a command
# for keeping track whether we are just in a command sequence or in a word sequence
my $cmdcomment= ($opencmd =~ m/^%/); # Flag to indicate whether opencmd is a comment (i.e. if we intend to simply comment out changed commands)
my ($command,$commandword,$closingbracket) ; # temporary variables needed below to remember sub-pattern matches
# split this block to split sequences joined in pass1
### print STDERR "DEBUG: marktags before splitlatex blocksplit ",join("|",@$block),"\n" if $debug;
@$block=splitlatex(join "",@$block);
### print STDERR "DEBUG: marktags $openmark,$closemark,$open,$close,$opencmd,$closecmd,$comment\n" if $debug;
print STDERR "DEBUG: after splitlatex ",join("|",@$block),"\n" if $debug;
# we use a dedicated pattern $extraspace_mt here (instead of the global $extraspace setting) to capture command sequences with intervening spaces no matter what the global setting
# this is done so we can capture those commands with a predefined number of arguments without having to introduce them again explicitly here
my $extraspace_mt='\s*';
foreach (@$block) {
$word=$_;
# Comment token: insert $comment directly after the leading "%"
if ( $word =~ s/^%/%$comment/ ) {
# a comment
# close a pending command sequence first and reset the state to 'beginning of sequence'
if ($cmd==1) {
push (@$retval,$closecmd) ;
$cmd=-1;
}
push (@$retval,$word);
next;
}
if ( $word =~ m/^\s*$/ ) {
### print STDERR "DEBUG MARKTAGS: whitespace detected |$word| cmdcom |$cmdcomment| |$opencmd|\n" if $debug;
# a sequence of white-space characters - this should only ever happen for the first element of block.
# in deleted block, omit, otherwise just copy it in
if ( ! $cmdcomment) { # ignore in deleted blocks
push(@$retval,$word);
}
next;
}
# first 'real' token of the block: emit $openmark exactly once
if (! $noncomment) {
push (@$retval,$openmark);
$noncomment=1;
}
# negative lookahead pattern (?!) in second clause is put in to avoid matching \( .. \) patterns
# also note that second pattern will match \\
if ( $word =~ /^[&{}\[\]]/ || ( $word =~ /^\\(?!\()(\\|[`'^"~=.]|[\w*@]+)/ && !iscmd($1,\@SAFECMDLIST,\@SAFECMDEXCL)) ) {
### if ( $word =~ /^[&{}\[\]]/ || ( $word =~ /^\\([\w*@\\% ]+)/ && !iscmd($1,\@SAFECMDLIST,\@SAFECMDEXCL)) ) {
# word is a command or other significant token (not in SAFECMDLIST)
## same conditions as in subroutine extractcommand:
# check if token is an alphanumeric command sequence with at least one non-optional argument
# \cmd[...]{...}{last argument}
# Capturing in the following results in these associations
# $1: \cmd[...]{...}{
# $2: cmd
# $3: last argument
# $4: } + trailing spaces
### pre-0.3 if ( ( $token =~ m/^(\\([\w\d\*]+)(?:\[$brat0\]|\{$pat_n\})*\{)($pat_n)(\}\s*)$/so )
if ( ( $word =~ m/^(\\([\w\d\*]+)(?:${extraspace_mt}\[$brat_n\]|${extraspace_mt}\{$pat_n\})*${extraspace_mt}\{)($pat_n)(\}\s*)$/so )
&& (iscmd($2,\@TEXTCMDLIST,\@TEXTCMDEXCL)|| iscmd($2,\@MATHTEXTCMDLIST,\@MATHTEXTCMDEXCL))
&& ( !$cmdcomment || !iscmd($2,\@CONTEXT2CMDLIST, \@CONTEXT2CMDEXCL) ) ) {
# Condition 1: word is a command? - if yes, $1,$2,.. will be set as above
# Condition 2: word is a text command - we mark up the interior of the word. There is a separate check for MATHTEXTCMDLIST
# because for $mathmarkup=WHOLE, the commands should not be split in pass1 (ie. math mode commands are not in
# TEXTCMDLIST, but the interior of MATHTEXT commands should be highlighted in both deleted and added blocks
# Condition 3: But if we are in a deleted block ($cmdcomment=1) and
# $2 (the command) is in context2, just treat it as an ordinary command (i.e. comment it open with $opencmd)
# Because we do not want to disable this command
# here we do not use $opencmd and $closecmd($opencmd is empty)
print STDERR "DEBUG: Detected text |$word| but not safe command \$2: $2 \$3: $3\n." if $debug;
# terminate whichever scope (command sequence or word sequence) is currently open
if ($cmd==1) {
push (@$retval,$closecmd) ;
} elsif ($cmd==0) {
push (@$retval,$close) ;
}
# copy the captures into named variables before further pattern matches can overwrite $1..$4
$command=$1; $commandword=$2; $closingbracket=$4;
@argtext=splitlatex($3); # split textual argument into tokens
# and mark it up (but we do not need openmark and closemark)
# insert command with initial arguments, marked-up final argument, and closing bracket
if ( $cmdcomment && iscmd($commandword,\@CONTEXT1CMDLIST, \@CONTEXT1CMDEXCL) ) {
# context1cmd in a deleted environment; delete command itself but keep last argument, marked up
push (@$retval,$opencmd);
$command =~ s/\n/\n${opencmd}/sg ; # repeat opencmd at the beginning of each line
# argument, note that the additional comment character is included
# to suppress linebreak after opening parentheses, which is important
# for latexrevise
push (@$retval,$command,"%\n{$AUXCMD\n",marktags("","",$open,$close,$opencmd,$closecmd,$comment,\@argtext),$closingbracket);
} elsif ( iscmd($commandword,,\@MATHTEXTCMDLIST, \@MATHTEXTCMDEXCL) ) {
# MATHBLOCK pseudo command: consider all commands safe, except & and \\, \begin and \end and a few package specific ones (look at UNSAFEMATHCMD definition)
# Keep these commands even in deleted blocks, hence set $opencmd and $closecmd (5th and 6th argument of marktags) to
# ""
# the 'local' overrides are dynamically scoped: they apply to the recursive marktags call below and
# are undone when this elsif branch is left
local @SAFECMDLIST=(".*");
local @SAFECMDEXCL=('\\','\\\\',@UNSAFEMATHCMD);
push(@$retval,$command,marktags("","",$open,$close,"","",$comment,\@argtext)#@argtext
,$closingbracket);
} else {
# normal textcmd or context1cmd in an added block
push (@$retval,$command,marktags("","",$open,$close,$opencmd,$closecmd,$comment,\@argtext),$closingbracket);
}
# in a deleted block the marked-up text command is followed by $AUXCMD and a line break
push (@$retval,$AUXCMD,"\n") if $cmdcomment ;
$cmd=-1 ;
} elsif ( $cmdcomment &&
( $word =~ m/^(\\([\w\d\*]+)(?:${extraspace_mt}\[$brat_n\]|${extraspace_mt}\{$pat_n\})*${extraspace_mt}\{)($pat_n)(\}\s*)/so )
&& iscmd($2,\@KEEPCMDLIST, \@KEEPCMDEXCL) ) {
# 'keepcmd' in a deleted environment: keep the command as is
push (@$retval,$close) if $cmd==0 ;
push (@$retval,$word);
$cmd=-1; # pretend we are at the beginning of a sequence because we do not want to add an additional $closecmd or $close before the next token, no matter what it is
} else {
# ordinary command
# open a command scope; if a word scope is currently open, close that one first
push (@$retval,$opencmd) if $cmd==-1 ;
push (@$retval,$close,$opencmd) if $cmd==0 ;
$word =~ s/\n/\n${opencmd}/sg if $cmdcomment ; # if opencmd is a comment, repeat this at the beginning of every line
### print STDERR "MARKTAGS: Add command |$word|\n";
push (@$retval,$word);
$cmd=1;
}
} else {
###print STDERR "DEBUG MARKTAGS is an ordinary word or SAFECMD command \n" if $debug;
# just an ordinary word or command in SAFECMD
# open a word scope; if a command scope is currently open, close that one first
push (@$retval,$open) if $cmd==-1 ;
push (@$retval,$closecmd,$open) if $cmd==1 ;
###TODO: check here if it is a command in MBOXCMD list, and surround it with \mbox{...}
### $word =~ /^\\(?!\()(\\|[`'^"~=.]|[\w*@]+)/ && iscmd($1,\@MBOXCMDLIST,\@MBOXCMDEXCL))
### but actually this check has been carried out already so can simply check if word begins with backslash
if ( $word =~ /^\\(?!\()(\\|[`'^"~=.]|[\w*@]+)(.*?)(\s*)$/s && iscmd($1,\@MBOXCMDLIST,\@MBOXCMDEXCL)) {
# $word is a safe command in MBOXCMDLIST
###print STDERR "DEBUG Mboxsafecmd detected:$word:\n" if $debug ;
# wrap the whole command (name $1, remainder $2, trailing white-space $3) in \mbox{...} followed by \hskip0pt
push(@$retval,"\\mbox{$AUXCMD\n\\" . $1 . $2 . $3 ."}\\hskip0pt$AUXCMD\n" );
} else {
# $word is a normal word or a safe command (not in MBOXCMDLIST)
push (@$retval,$word);
}
$cmd=0;
}
}
# close whichever scope is still open at the end of the block, then add the closing mark if an opening mark was written
push (@$retval,$close) if $cmd==0;
push (@$retval,$closecmd) if $cmd==1;
push (@$retval,$closemark) if ($noncomment);
return @$retval;
}
# take_comments_and_newline_from_frac()
# used in preprocess
# Normalises the two arguments of \frac in the current topic variable $_ (modified in place):
# white-space and comments between \frac and its arguments are dropped and unbraced single-token
# arguments are wrapped in braces. Takes no arguments (empty prototype).
sub take_comments_and_newline_from_frac() {
# some special magic for common usage of frac, which does not conform to the latexdiff requirements but can be made to fit
# note that this is a rare exception to the general rule that the new tex can be reconstructed from the diff file
# regex that matches space and comment characters
# (one white-space character, or a "%" followed by a non-greedy run of characters other than newline)
my $space = qr/\s|%[^\n]*?/;
# \frac {abc} -> \frac{abc}
# \frac1 -> \frac{1}
# \frac a -> \frac{a}
# \frac \lambda -> \frac{\lambda}
# The branch-reset group (?|...) makes every alternative store its argument in $1
s/\\frac(?|${space}+\{($pat_n)\}|${space}*(\d)|${space}+(\w)|${space}*(\\[a-zA-Z]+))/\\frac\{$1\}/g;
# same as above for the second argument of frac
# ($1 is the already braced first argument, the branch-reset group stores the second argument in $2)
s/\\frac(\{$pat_n\})(?|${space}*\{($pat_n)\}|${space}*(\d)|${space}+(\w)|${space}*(\\[a-zA-Z]+))/\\frac$1\{$2\}/g;
}
# preprocess($string, ..)
# carry out the following pre-processing steps for all arguments (the arguments are modified in place):
# 1. Remove leading white-space
# Change \{ to \QLEFTBRACE and \} to \QRIGHTBRACE and \& to \AMPERSAND
# #. Change {,},\frac in comments to \CLEFTBRACE, \CRIGHTBRACE, \CFRAC
# 2. mark the first empty line (in a block of several) with \PAR tokens
# 3. Convert all '\%' into '\PERCENTAGE ' and all '\$' into \DOLLAR to make parsing regular expressions easier
# 4. Convert all \verb|some verbatim text| commands (where | can be an arbitrary character)
# into \verb{hash} (also lstinline)
# 5. Convert \begin{verbatim} some verbatim text \end{verbatim} into \verbatim{hash} (not only verbatim, all patterns matching VERBATIMENV)
# 6. Convert _n into \SUBSCRIPTNB{n} and _{nnn} into \SUBSCRIPT{nn}
# 7. Convert ^n into \SUPERSCRIPTNB{n} and ^{nnn} into \SUPERSCRIPT{nn}
# 8. a. Convert $$ $$ into \begin{DOLLARDOLLAR} \end{DOLLARDOLLAR}
# b. Convert \[ \] into \begin{SQUAREBRACKET} \end{SQUAREBRACKET}
# 9. Convert all picture environments (\begin{PICTUREENV} .. \end{PICTUREENV}) into \PICTUREBLOCKenv
# For math-mode COARSE,WHOLE or NONE option -convert all \begin{MATH} .. \end{MATH}
# into \MATHBLOCKmath{...} commands, where MATH/math is any valid math environment
# 10. Add final token STOP to the very end. This is put in because the algorithm works better if the last token is identical. This is removed again in postprocessing.
#
# NB: step 6 and 7 is likely to convert some "_" inappropriately, e.g. in file
# names or labels but it does not matter because they are converted back in the postprocessing step
# Returns: leading white space removed in step 1
# NOTE(review): in the body below only the \lstinline/\verb hashing, the math/array block conversion and the
# STOP token are present, and there is no explicit return statement - verify steps 1-8 and the return value
# against the upstream source.
sub preprocess {
# $_ is aliased to each argument in turn, so all substitutions change the caller's strings
for (@_) {
# change in \verb and similar commands - note that I introduce an extra space here so that the
# already hashed variants do not trigger again
# transform \lstinline{...}
# s/\\lstinline(\[$brat0\])?(\{(?:.*?)\})/"\\DIFlstinline". $1 ."{". tohash(\%verbhash,"$2") ."}"/esg;
# s/\\lstinline(\[$brat0\])?((\S).*?\2)/"\\DIFlstinline". $1 ."{". tohash(\%verbhash,"$2") ."}"/esg;
# the verbatim text (including its delimiters) is stored in %verbhash and replaced by its hash key
s/\\lstinline((?:\[$brat_n\])?)(\{(?:.*?)\})/"\\DIFlstinline". $1 ."{". tohash(\%verbhash,"$2") ."}"/esg;
s/\\lstinline((?:\[$brat_n\])?)(([^\s\w]).*?\3)/"\\DIFlstinline". $1 ."{". tohash(\%verbhash,"$2") ."}"/esg;
s/\\(verb\*?|lstinline)([^\s\w])(.*?)\2/"\\DIF${1}{". tohash(\%verbhash,"${2}${3}${2}") ."}"/esg;
# Change \{ to \QLEFTBRACE, \} to \QRIGHTBRACE, and \& to \AMPERSAND
# NOTE(review): the next line looks truncated (it opens a substitution with "s/(?" that is never completed);
# compare with the upstream source before relying on this block
s/(? brackets
# Example: \begin{alignat}{3} ... \end{alignat} will turn into \MATHBLOCKalignat[{3}]{ ... }
if ( $mathmarkup != FINE ) {
# DIFANCHORARRB and DIFANCHORARRE, DIFANCHORMATHB and DIFANCHORMATHE markers are inserted here to encourage the matching algorithm
# to always match up the closing brace. Otherwise sometimes one ends up with a situation where
# the closing brace is deleted and added at another point. The deleted closing brace is then
# prevented by a %DIFDELCMD, leading to material leaking in or out of the math environment.
# The anchors are removed in post-processing again. (note that they are simple text to cause least amount of complications
# Admittedly, this is something of a hack and will not always work. If it does not, then one needs to
# resort to WHOLE or FINE, or NONE math mode processing.
s/\\begin\{($ARRENV)}(.*?)\\end\{\1}/\\ARRAYBLOCK$1\{$2\\DIFANCHORARRB \}\\DIFANCHORARRE /sg;
# normalise \frac arguments in $_ before the math environments are wrapped
take_comments_and_newline_from_frac();
# Convert Math environments with arguments
# NOTE(review): the capture group for the arguments is itself repeated ((...)+), so $2 only holds the
# last argument matched when an environment has several
s/\\begin\{($MATHENV|$MATHARRENV|SQUAREBRACKET)\}((?:\[$brat_n\])|(?:\{$pat_n\}))+(.*?)\\end\{\1\}/\\MATHBLOCK$1\[$2\]\{$3\\DIFANCHORMATHB \}\\DIFANCHORMATHE /sg;
# Convert Math environments without arguments
s/\\begin\{($MATHENV|$MATHARRENV|SQUAREBRACKET)\}(.*?)\\end\{\1\}/\\MATHBLOCK$1\{$2\\DIFANCHORMATHB \}\\DIFANCHORMATHE /sg;
}
# add final token " STOP"
$_ .= " STOP"
}
}
# $expanded=linecomment($string)
# preface all lines with verbatim marker (usually DIFVRB)
# The string is split at newlines; every line except the first is prefixed with "%$VERBCOMMENT".
# On the first line a leading optional argument [...] and leading white-space are retained as is and
# only the remainder is prefixed. The result always ends with a newline.
# An empty string (or a string consisting only of newlines) yields "\n".
sub linecomment {
  my @verbatimlines=split("\n",$_[0]);
  # split returns an empty list for an empty string: in that case there is no first line to
  # treat specially (touching $verbatimlines[0] would operate on an undefined value)
  if (@verbatimlines) {
    # the first line needs special treatment - we do want to retain optional arguments as is but wrap the remainder also with VERBCOMMENT
    # (group 2 is mandatory, so it is always defined when the pattern matches)
    ### print STDERR "DEBUG: before verbatimlines[0] = ",$verbatimlines[0],"\n";
    $verbatimlines[0] =~ s/^((?:\s*\[$brat_n\])?\s*)([^\s\[].*)/$1\%$VERBCOMMENT$2/;
    ### print STDERR "DEBUG: after verbatimlines[0] = ",$verbatimlines[0],"\n";
  }
  return(join("\n%$VERBCOMMENT",@verbatimlines)."\n");
}
# $simple=reverselinecomment($env, $string)
# Undo the line marking of a verbatim-like environment body and wrap the result again in
# \begin{$env} ... \end{$env}: DIFVRB comments are removed but changed lines stay marked.
# If at least one changed line is present, the optional argument of the environment is extended by
# alsolanguage=DIFcode and the environment is enclosed in \DIFmodbegin ... \DIFmodend.
sub reverselinecomment {
  my ($environment, $verbatimtext)=@_;
  my $envbegin="\\begin{${environment}}";
  my $envend="\\end{${environment}}";
  ### print STDERR "DEBUG REVERSELINECOMMENT input: $environment,|$verbatimtext|\n" if $debug;

  # Strip markup inserted by latexdiff. This should only be present if the type of verbatim
  # environment was changed. Some information of the old file is lost here; it could be rescued
  # by moving it out of the environment, at the price of more bookkeeping, which is probably
  # not necessary.
  $verbatimtext =~ s/\\DIFaddbegin //g;
  $verbatimtext =~ s/\\DIFaddend //g;
  $verbatimtext =~ s/\\DIFdelbegin //g;
  $verbatimtext =~ s/\\DIFdelend //g;
  $verbatimtext =~ s/$DELCMDOPEN.*//g;

  # strip the DIFVRB mark from the lines
  $verbatimtext=~ s/%$VERBCOMMENT//g;

  # Any marker still left over belongs to a changed line; remove that part of the markup and
  # remember whether there was one
  my $haschanges=( $verbatimtext=~ s/$VERBCOMMENT//g );

  # nothing changed: plain environment, no style extension needed
  unless ($haschanges) {
    ### print STDERR "DEBUG REVERSELINECOMMENT output: |${envbegin}${verbatimtext}${envend}|\n" if $debug;
    return($envbegin . $verbatimtext . $envend);
  }

  # Extend an existing optional argument by the ~alsolanguage~ modifier. This also deletes the rest of
  # the line after the optional argument, as lstlisting sometimes gets very confused by what is there
  # (and at other times seems to ignore it anyway)
  my $extended=( $verbatimtext =~ s/^(\s*)\[($brat_n)\](.*)\n/$1\[$2,alsolanguage=DIFcode\]\n/ );
  if ( !$extended ) {
    # no optional argument yet: prepend one, adding a line break unless the text begins with an empty line
    my $optarg=( $verbatimtext =~ m/^\s*\n/ ) ? "[alsolanguage=DIFcode]" : "[alsolanguage=DIFcode]\n";
    $verbatimtext = $optarg . $verbatimtext;
  }

  # There is a bug in listings package (at least v1.5b) for empty comments where the actual comment command is not made invisible
  # Therefore an artificial '-' character is introduced at the end of empty added or deleted lines
  $verbatimtext =~ s/($DELCOMMENT\s*)$/$1-/mg;

  $verbatimtext = "\\DIFmodbegin\n" . $envbegin . $verbatimtext . $envend . "\n\\DIFmodend";
  ### print STDERR "DEBUG REVERSELINECOMMENT output: |$verbatimtext|\n" if $debug;
  return($verbatimtext);
}
# $hashstring=tohash(\%hash,$string)
# creates a hash value based on string and stores in %hash
# The key is the position-weighted sum of the character codes of $string (as signed chars),
# returned as a decimal string; $hash->{key} is set to $string.
# If the key is already taken by a different string, further passes over the characters (with
# continuing weights) are added until a free key, or a key already holding $string, is found.
# Calling tohash again with a string already stored returns the same key.
sub tohash {
  my ($hash,$string)=@_;
  my @codes=unpack('c*',$string);
  my ($sum,$i)=(0,1);
  my $hstr;
  while (1) {
    my $before=$sum;
    foreach my $val (@codes) {
      $sum += $i*$val;
      $i++;
    }
    # On a retry pass (i.e. after a collision) the sum has to change, otherwise the same occupied
    # key would be tested forever. This happens for an empty string or a string for which the
    # pass contributes nothing (e.g. only NUL characters), so force progress in that case.
    $sum += $i++ if defined($hstr) && $sum == $before;
    $hstr= "$sum";
    last unless (defined($hash->{$hstr}) && $string ne $hash->{$hstr});
    # else found a duplicate HASH need to repeat for a higher hash value
  }
  $hash->{$hstr}=$string;
  ### print STDERR "Hash:$hstr: Content:$string:\n";
  return($hstr);
}
# $string=fromhash(\%hash,$hashstring)
# looks up the string stored in %hash under key $hashstring and returns it
# $string=fromhash(\%hash,$hashstring,$prependstring)
# as above, but every line of the returned string is prefixed with $prependstring
# (the entry stored in %hash itself is not changed)
sub fromhash {
  my ($table,$key,@optional)=@_;
  my $text=$table->{$key};
  # without a third argument the stored string is returned as is
  return $text unless @optional;
  my $prefix=$optional[0];
  $text =~ s/^/$prefix/mg;
  return $text;
}
# $stripped=stripdelcmdopen($string)
# returns a copy of $string from which all matches of $DELCMDOPEN have been removed
# ($DELCMDOPEN is interpolated into the pattern as a regular expression)
sub stripdelcmdopen {
  my ($text)=@_;
  $text =~ s/${DELCMDOPEN}//mg;
  return $text;
}
# writedebugfile(string, label)
# if $debug set writes to file latexdiff.debug.