Article: 4225 of comp.lang.perl
Xref: feenix.metronet.com comp.lang.perl:4225
Newsgroups: comp.lang.perl
Path: feenix.metronet.com!news.utdallas.edu!wupost!howland.reston.ans.net!agate!ames!olivea!apple.com!voder!berlioz.nsc.com!jedi!arielf
From: arielf@mirage.nsc.com (Ariel Faigon)
Subject: ckbal - check balanced tokens in C source (perl script)
Message-ID: <1993Jul14.202605.13360@berlioz.nsc.com>
Followup-To: comp.lang.perl
Keywords: Reiser-cpp tokens balance C-language-tool
Sender: news@berlioz.nsc.com (UseNet News account)
Reply-To: arielf@mirage.nsc.com
Organization: National Semiconductor Corp.
Date: Wed, 14 Jul 1993 20:26:05 GMT
Lines: 345

Have you ever been frustrated by the Reiser-cpp or pcc insufficient data
in error messages? Have you ever got a message like "1073: missing #endif"
without a reference to the line where the opening #if appears. Have you ever
left out a closing comment and searched your source for the error in the wrong
place? If so, 'ckbal' can help you.
I wrote this about a year ago in an evening of such frustration, and just thought
it might be a good idea to post this for the benefit of all.

Sorry, the man page is not wrapped, and it doesn't handle C++ style comments.
yet it saved me and my colleagues many long hours in the past year or so. Enjoy.

#!/usr/local/bin/perl 
# On Messy-DOS systems start with the following 3 line prologue:
@REM=(qq!
@perl  %0.bat %1 %2 %3 %4 %5 %6 %7 %8 %9
@goto end !) if 0 ;
 
#
# ckbal - check balanced tokens in C source.
#
# Usage:
#	ckbal [-d] [-w] C-files...
#
#       checked token-pairs include
#               ( )
#               [ ]
#               { }
#               /* */
#               " "
#               ' '
#		#ifxxx [#els...] #endif
#
# Errors detected:
#       o EOF hit with unclosed token (opened on line X)
#       o Closing token on line X has no matching open token
#
# Options:
#	-w	Adds warnings about potential errors:
#       	Unmatched token pairs within strings or comments
#       	(e.g. Open comment within a comment)
#
# 	-d	Adds debugging printouts
#
# States during parsing:
#       o In comment
#       o In string
#       o In char constant
#       o In code
#
# TODO:
#	o Make it simpler - more general, more table driven
#	  but for a first shot it serves its purpose...
#	o Add C++, // style comments
# BUGS:
#	o A character (single quoted) string is arbitrary long
#	o There are cases where only '-w' will spot the problem
#
# Author: Ariel Faigon, (arielf@mirage.nsc.com), May 13, 1992
# Donated to the public domain - please leave author data intact. Enjoy.
#

# I use eval because it may not work on my DOS, your mileage may vary.
eval "require 'getopts.pl'";
unless ($@) {
	&Getopts('dw');
}

# -- Opening -> closing token mapping
%pairof = (
        '(', ')',
        '[', ']',
        '{', '}',
        '/*', '*/',
        '"', '"',
        "'", "'",
	'#if',     '#endif',
	'#ifdef',  '#endif',
	'#ifndef', '#endif',
	
);
 
# -- State transitions: state + token -> new-state
%next_state = ( "code$;/*", 'comment',
                "code$;\"", 'string',	# " close quote to keep 4dos happy
                "code$;'", 'char',
                "comment$;*/", 'code',
                "string$;\"", 'code',	# " close quote to keep 4dos happy
                "char$;'", 'code'
);

# -- single-char-codes -> real-tokens
# text is 'canonicalized' using single char codes for convenience + efficiency
%realtok_of = (
	"\201", '/*',
	"\202", '*/',

	"\211", '#if', 
	"\212", '#ifdef',
	"\213", '#ifndef',
	"\214", '#else',
	"\215", '#elsif',
	"\216", '#endif'
);

# Given current state and token, determine if a token is "opening"
# Should probably be another assoc table...
sub opening {
    local($state, $tok) = @_;
    ($tok =~ m,[\[({]|/\*, || $state eq 'code' && $tok =~ /["']/ ||
     $tok =~ /^#if/)
	    ? 1 : 0;
}
 
# Given current state and token determine if a token is "closing"
# Should probably be another assoc table...
sub closing {
    local($state, $tok) = @_;
    ($tok =~ m,[])}]|\*/, || $tok eq "'" && $state eq 'char' ||
     $tok eq '"' && $state eq 'string' || $tok eq '#endif')
	? 1 : 0;
}

# -- pop and cleanup all remaining 'unclosed' tokens from stack
sub cleanup_pop {
    local(*stack) = @_;
    local($tok, $line, $prefix);

    $prefix = '';
    unless ($state eq 'code' || $at_eof) {
	$prefix = "warning (in $state): ";
    }
    while (@stack) {
	$tok = pop(@stack);
	$line = pop(@stack);
	if ($opt_w || !$prefix) {
	    print STDERR "$0: $prefix$file, $line: '$tok': open without close\n";
	}
    }
}

$0 =~ s,.*/,,;			# Trim pathname for error messages
unless (@ARGV) {
    print STDERR "ckbal: check balanced tokens in C source

Usage:
	ckbal [-d] [-w] C-files...

Checked token-pairs include:
		( ), [ ], { }, /* */, \" \", ' '
		#ifxxx [#els...] #endif
Options:
	-w	Adds warnings about potential errors:
		Unmatched token pairs within strings or comments
		(e.g. Open comment within a comment)

 	-d	Adds debugging printouts
";
	exit(1);
}

# -- main program
foreach $file (@ARGV) {
    &dofile;
}

# -- process one file 
sub dofile {
    local(*stack);
 
    &canonicalize($file);
    if ($opt_d) {
        print "after canonicalize: '@tokens'\n";
    }
 
 
    $line = 1;
    $state = 'code';
    $at_eof = 0;
    while ($tok = shift(@tokens)) {
        if ($tok eq "\n") {
            $line++;
            next;
        }
        # Move to a new state if a transition is needed
        if ($next_state{$state, $tok}) {
            $new_state = $next_state{$state, $tok};
        } else {
	    $new_state = $state;			# default - no change
	}
        if (&opening($state, $tok)) {
	    # --- 'opening' tokens
	    # special case warning:
            if ($opt_w && $state eq 'comment' && $tok eq '/*') {
               print STDERR "$0: warning: $file, $line: open-comment in comment\n";
            }
	    $state = $new_state;
            *stack = $state;
	    $realstack = $state;
	    if ($tok =~ /^#/) {
		push(@cpp, ($line, $tok));
		$realstack = 'cpp';
	    } else {
                push(@stack, ($line, $tok));
	    }
	    if ($opt_d) {
		print STDERR "$0: line $line: pushed $tok on $realstack stack\n";
	    }
        } elsif (&closing($state, $tok)) {
	    # --- 'closing' tokens
	    if ($tok =~ /^#/) {
		if (@cpp) {
		    $pop = pop(@cpp);
		    pop(@cpp);		# get rid of line-no too
		    if ($opt_d) {
		    print STDERR "$0: line $line: popped $pop from cpp stack\n";
		    }
		} else {
		    print STDERR "$0: $file, $line: '$tok': close without open\n";
	    	}
		next;
	    }

            $prev_tok = $stack[$#stack];
            $prev_line = $stack[$#stack-1];
            if ($pairof{$prev_tok} ne $tok) {
		if ($state eq 'string' && $tok eq '"' ||
		    $state eq 'char' && $tok eq "'" ||
		    $state eq 'comment' && $tok eq '*/') {
		    shift(@stack); shift(@stack);	# get rid of opening
		    &cleanup_pop(*stack, $state);
		    $state = $new_state;
		    *stack = $state;
		    next;
		} elsif ($state eq 'code') {
                    print STDERR "$0: $file, $line: '$tok': close without open\n";
                    @string = ();
                    @char = ();
                    @comment = ();
                } else {		# not in code: comment, string or char
                    $prefix = "warning (in $state): ";
                    if ($opt_w) {
                        print STDERR "$0: $prefix$file, $line: '$tok': close without open\n";
                    }
                    if ($state eq 'string' && $tok eq '"') {
                        @string = ();
                    } elsif ($state eq 'comment' && $tok eq '*/') {
                        @comment = ();
                    } elsif ($state eq 'char' && $tok eq "'") {
                        @char = ();
                    }
                }
            } else {     # match found: pop matching token (and line) from stack
                pop(@stack);
                pop(@stack);
		if ($opt_d) {
		    print STDERR "$0: line $line: popped $tok from $state stack\n";
		}
		$state = $new_state;
                *stack = $state;
            }				# found matching open to close token
        }			# open or close token
	elsif ($tok =~ /#els/) {
	    unless (@cpp) {
		print STDERR "$0: $file, $line: '#if' less '$tok'\n";
	    }
	}
    }			# while tokens in input
    $at_eof = 1;
    $eof_header = 0;
    for $st ('code', 'comment', 'string', 'char', 'cpp') {
	eval "*stack = *$st";
	if (@stack) {
	    unless ($eof_header) {
		print STDERR "$0: $file", ': EOF with unclosed tokens:',"\n";
		$eof_header = 1;
	    }
	    &cleanup_pop(*stack, $st);
	}
    }
}
 
#
# canonicalize:
#	Convert input file to a list of tokens while stripping all
#	"uninteresting" tokens from input.
#
#	o Delete every \x to ease string and char constant parsing.
#	o Replace every two-chars interesting tokens (like /* and */)
#	  by a single-char code
#	o Leave paired-tokens (like {}[]) and newlines (to count lines)
#	  deleting all the rest
#
sub canonicalize {
    local($file) = @_;
    local($_, $tok) = ('', '');
 
#    undef $/;                # read all input into one long string
#    $* = 1;                # Allow multi-line pattern matching
 
    @tokens = ();
    open(FH, $file) || die "$0: cannot open '$file' - $!\n";
 
    while (<FH>) {
	@Ltokens = ();
        s/\\[^\n]//g; # Get rid of escape-sequences to simplify string parsing 

	# normalize to one-char tokens to simplify parsing
        s,/\*,\201,g;
        s,\*/,\202,g;

	s,^\s*#\s*if,\211,;
	s,^\s*#\s*ifdef,\212,;
	s,^\s*#\s*ifndef,\213,;
	s,^\s*#\s*else,\214,;
	s,^\s*#\s*elsif,\215,;
	s,^\s*#\s*endif,\216,;

	# Get rid of all chars except 'interesting' chars
        y/()[]{}"'\n\201-\230//cd;

	@Ltokens = split(//, $_);	# convert line to list of tokens
	while ($tok = shift(@Ltokens)) {# translate back to real-tokens
	    if ($realtok_of{$tok}) {
		push(@tokens, $realtok_of{$tok});
	    } else {
		push(@tokens, $tok);
	    }
	}
    } 
    close(FH);
    # @tokens;		# return interesting list of tokens for whole file
}

# 
# Messy-DOS epilogue:
#
@REM=(qq!
:end !) if 0 ;


---
Peace, Ariel
arielf@mirage.nsc.com