webdiff

#!/usr/bin/perl -w

#################################################################################
#
# Webdiff
#
# Compares two HTML pages (current and archive) and outputs a new page based
# on the current page but with the differences between the two pages highlighted.
#
# Copyright (C) 1998  Chew Wei Yih
# Copyright (C) 2004,2005 Baruch Even <baruch@ev-en.org>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
#
#################################################################################

use Getopt::Long;
use Pod::Usage;

# Initialize parameters with their defaults; GetOptions below overrides them.
$oldpage     = "";
$curpage     = "";
$outpage     = "";
$hicolor     = "blue";
$asciimarker = 0;
$ignore      = "none";
$ignoreurl   = "none";
$tmin        = 0;
$tmax        = 99999;
$debug       = 0;
$ignoreFile  = "ignore.list";

# HOME may be unset (e.g. in a stripped-down environment); do not trigger an
# "uninitialized value" warning in that case.
$basedir = ( defined $ENV{HOME} ? $ENV{HOME} : "" ) . "/.websec/";

# Parse options
$help = 0;
$man  = 0;
GetOptions(
    "help|?"       => \$help,
    "man"          => \$man,
    "basedir=s"    => \$basedir,
    "archive=s"    => \$oldpage,
    "current=s"    => \$curpage,
    "out=s"        => \$outpage,
    "hicolor=s"    => \$hicolor,
    "asciimarker"  => \$asciimarker,
    "ignore=s"     => \$ignore,
    "ignoreurl=s"  => \$ignoreurl,
    "tmin=i"       => \$tmin,
    "tmax=i"       => \$tmax,
    "debug"        => \$debug,
    "ignorefile=s" => \$ignoreFile
  )
  or pod2usage(2);    # invalid command line: usage on STDERR, exit status 2

# Asking for help is not an error, so both help forms exit with status 0.
pod2usage( -exitval => 0, -verbose => 1 ) if $help;
pod2usage( -exitval => 0, -verbose => 2 ) if $man;

# Remove trailing slash from basedir, we will add it ourself everywhere needed
$basedir =~ s/\/$//;

# Make sure some essential option values are supplied.  The first missing one
# is reported on STDERR and ends the program with exit status -1.
foreach my $required (
    [
        $oldpage,
        "You did not supply the archive HTML file via the --archive option.\n"
    ],
    [
        $curpage,
        "You did not supply the current HTML file via the --current option.\n"
    ],
    [
        $outpage,
        "You did not supply the output HTML file via the --out option.\n"
    ]
  )
{
    my ( $value, $complaint ) = @$required;
    next if $value ne "";
    print STDERR $complaint;
    exit -1;
}

# Translate a symbolic highlight colour into its HTML colour code.  A value
# that is not a key of %colorList (for example "#rrggbb") is kept as given;
# an empty value falls back to blue.
%colorList = (
    yellow => "#ffff99",
    blue   => "#66ccff",
    pink   => "#ffcccc",
    grey   => "#4c4c4c"
);
$hicolor = $colorList{$hicolor} if defined $colorList{$hicolor};
$hicolor = $colorList{"blue"}   if $hicolor eq "";

# Other global variables

# Exit status of the program; PerformDiff sets it to 1 when it highlights
# a token.
$changeStatus = 0;

# Tags handed to MangleHTML.  A trailing "*" marks a tag for which MangleHTML
# also mangles the form that carries attributes.
@tags = qw(
  CODE B I U TT EM
  FONT* SUP SUB SMALL STRIKE STRONG
  CAPTION* A*
);

# Read the entries of the selected sections of an ignore file.
#
#   $path      - path of the ignore file
#   $selection - wanted section names, comma-delimited and wrapped in commas
#                (",name1,name2,"); names are compared case-insensitively
#
# File format as parsed here: leading/trailing white space is stripped from
# every line, lines starting with "#" are comments, a line starting with
# "__END__" ends parsing, "[name]" opens a section and an empty line closes it.
# Every other line inside a selected section is one entry.
#
# Returns the list of entries; dies if the file cannot be opened.
sub ReadIgnoreSections {
    my ( $path, $selection ) = @_;
    my @entries;
    my $inSection = 0;

    # Read line by line regardless of what the rest of the program does to $/.
    local $/ = "\n";

    open( my $fh, "<", $path ) or die "Cannot open $path: $!\n";
    while ( my $line = <$fh> ) {
        chomp $line;
        $line =~ s/^\s*//;
        $line =~ s/\s*$//;

        # Ignore comments
        next if $line =~ m/^#/;

        # Stop with a finish marker
        last if $line =~ m/^__END__/;

        if ($inSection) {
            if   ( $line eq "" ) { $inSection = 0; }
            else                 { push @entries, $line; }
            next;
        }

        # An empty line outside a section must not act as an (empty) section
        # name, otherwise a selection such as ",," would select everything.
        next if $line eq "";

        ( my $section = $line ) =~ s:\[\s*(.*?)\s*\]:$1:sig;

        # \Q...\E: the section name is literal text, not a regular expression.
        $inSection = 1 if $selection =~ m:,\Q$section\E,:i;
    }
    close($fh);

    return @entries;
}

# Read ignore keywords
if ( $ignore ne "none" ) {
    $ignore = "," . $ignore . ",";
    @ignore = ReadIgnoreSections( "$basedir/$ignoreFile", $ignore );
}
if ($debug) {
    foreach (@ignore) { print "Ignore: $_\n"; }
}

# Read ignore urls
if ( $ignoreurl ne "none" ) {
    $ignoreurl = "," . $ignoreurl . ",";
    @ignoreurl = ReadIgnoreSections( "$basedir/$ignoreFile", $ignoreurl );
}
if ($debug) {
    foreach (@ignoreurl) { print "IgnoreURL: $_\n"; }
}

# Read both input pages in one go.  The input record separator is undefined
# only inside this block, and the files are opened with three-argument open
# so that unusual characters in a file name cannot change the open mode.
# After this block $oldpage holds the archive page's content, not its path.
{
    local $/;

    # Open input pages for comparing
    open( my $oldfh, "<", $oldpage ) or die "Cannot open $oldpage: $!\n";
    open( my $curfh, "<", $curpage ) or die "Cannot open $curpage: $!\n";

    # Read input pages
    $oldpage = <$oldfh>;
    $newpage = <$curfh>;

    # Close input pages
    close($oldfh);
    close($curfh);
}

# Mangle some HTML tags to a form suitable for analysis
$oldpage = &MangleHTML( $oldpage, @tags );
$newpage = &MangleHTML( $newpage, @tags );

# Tokenize old and new page; TokenizePage leaves its result in @tokens
&TokenizePage($oldpage);
@oldtokens = @tokens;
@tokens    = ();
if ($debug) {
    foreach (@oldtokens) { print ">>>> $_\n"; }
}
&TokenizePage($newpage);
@newtokens = @tokens;
@tokens    = ();

# Compare the new tokens against the old ones and highlight the differences
&PerformDiff();

# Restore tags which we have previously mangled
foreach my $token (@newtokens) {
    $token =~ s/\@\@\@\@&nbsp;~~~~/&nbsp;/sig;

    # The substitution is repeated once per entry of @tags.  A single pass
    # stops at the first closing marker, so markers left over from a mangled
    # tag nested inside another one are only restored by a later pass.
    foreach ( 1 .. scalar(@tags) ) {
        $token =~ s/~~~~(\/*.*?)\@\@\@\@/<$1>/sig;
    }
}

# Write the output page, one token per line
open( my $outfh, ">", $outpage ) or die "Cannot open $outpage: $!\n";
foreach my $token (@newtokens) { print {$outfh} "$token\n"; }

# Buffered write errors (e.g. a full disk) only show up when closing
close($outfh) or die "Cannot write $outpage: $!\n";

# End of program: the exit status is 1 if something was highlighted, else 0
if ( !$changeStatus ) {
    if ($debug) { print "No changes were detected.\n"; }
}
exit $changeStatus;

# Split a page into tokens and store them in the global @tokens.
# Tags ("<...>") and the text between them become separate tokens, and
# every token has its leading and trailing white space removed.
sub TokenizePage() {
    my ($html) = @_;

    # The capturing group makes split return the tags as tokens as well.
    @tokens = split /(<.*?>)/s, $html;

    foreach my $piece (@tokens) {
        $piece =~ s/^\s+//sig;
        $piece =~ s/\s+$//sig;
    }
}

# Perform diff between two pages
#
# Walks over the tokens of the new page (global @newtokens) and wraps every
# token that counts as changed in a <span> with background colour $hicolor.
# The loop variable aliases the array elements, so the highlighting is written
# straight into @newtokens.  $changeStatus is set to 1 as soon as one token
# has been highlighted.
#
# Takes no arguments and returns nothing useful.  Reads the globals $debug,
# $asciimarker and $hicolor; the Token* helpers it calls are defined in this
# file.
sub PerformDiff() {
    # State flags: set while the walk is inside the corresponding region.
    my $commentOn   = 0;
    my $scriptOn    = 0;
    my $styleOn     = 0;
    my $titleOn     = 0;
    my $ignoreUrlOn = 0;

    foreach $token (@newtokens) {
        # Empty tokens carry nothing to compare.
        if ( $token eq "" ) { next; }
        if ($debug) { print "<<<< $token\n"; }

        # Region tracking.  In each pair the first test switches a flag on,
        # the second switches it off again and skips the closing token.
        # The order matters: a token matching both tests of a pair is skipped.
        # NOTE(review): the "->" test is not tied to $commentOn, so it also
        # skips ordinary tokens that merely contain "->" - confirm intent.
        if ( $token =~ m|^.*?<!-.*?$| ) { $commentOn = 1; }
        if ( $token =~ m|^.*?->.*?| )   { $commentOn = 0; next; }

        if ( $token =~ m|^.*?<TITLE.*?>$|i )  { $titleOn = 1; }
        if ( $token =~ m|^.*?</TITLE.*?>$|i ) { $titleOn = 0; next; }

        if ( $token =~ m|^.*?<SCRIPT.*?>$|i )  { $scriptOn = 1; }
        if ( $token =~ m|^.*?</SCRIPT.*?>$|i ) { $scriptOn = 0; next; }

        if ( $token =~ m|^.*?<STYLE.*?>$|i )  { $styleOn = 1; }
        if ( $token =~ m|^.*?</STYLE.*?>$|i ) { $styleOn = 0; next; }

        # A link to an ignored URL switches highlighting off until a token
        # containing the (mangled) end of a hyperlink has been seen.
        if ( TokenContainsIgnoreURL($token) ) { $ignoreUrlOn = 1; }
        if ( $ignoreUrlOn && TokenContainsHlinkEnd($token) ) {
            $ignoreUrlOn = 0;
            next;
        }

        # Classification: the first matching branch wins, and every branch
        # except the final "else" leaves the token untouched.
        if ($commentOn) {
            if ($debug) { print "#### Token is within comment block.\n"; }
        }
        elsif ($titleOn) {
            if ($debug) { print "#### Token is within title block.\n"; }
        }
        elsif ($scriptOn) {
            if ($debug) { print "#### Token is within Javascript block.\n"; }
        }
        elsif ($styleOn) {
            if ($debug) { print "#### Token is within stylesheet block.\n"; }
        }
        elsif ($ignoreUrlOn) {
            if ($debug) {
                # $lastIgnoreURL is set by TokenContainsIgnoreURL
                print "#### Token contains ignore URL - $lastIgnoreURL\n";
            }
        }
        elsif ( $token =~ m/<.*?>/sig ) {
            if ($debug) { print "#### Token is a HTML tag.\n"; }
        }
        elsif ( TokenIsMangledHTMLTag($token) ) {
            if ($debug) { print "#### Token is a mangled HTML tag.\n"; }
        }
        elsif ( TokenContainsIgnoreKeyword($token) ) {
            if ($debug) {
                # $lastIgnoreKeyword is set by TokenContainsIgnoreKeyword
                print
                  "#### Token contains ignore keyword - $lastIgnoreKeyword\n";
            }
        }
        elsif ( TokenExists($token) ) {
            if ($debug) { print "#### Token exists in old page.\n"; }
        }
        else {
            # Not excluded by any test above: highlight the token.
            if ($debug) { print "#### Token has been highlighted!\n"; }
            if ($asciimarker) {
                # Extra plain-text markers placed inside the <span>
                $token = "###>>>". $token ."<<<###";
            }
            $token =
                    "<span style=\"background-color: $hicolor\">"
                    . $token . "</span>";
            $changeStatus = 1;
        }
    }
}

# Check if token is a mangled HTML tag
#
# Returns 1 if the token consists of nothing but mangled tags
# ("~~~~...@@@@") and white space, and 0 as soon as any other text is found.
# An empty token yields 1.
sub TokenIsMangledHTMLTag {
    my ($token) = @_;

    while ( $token ne "" ) {
        if ( $token =~ m/^\s*(.*?)\s*~~~~.*?\@\@\@\@\s*(.*?)\s*$/i ) {

            # Copy the captures before anything else can overwrite them.
            my ( $before, $rest ) = ( $1, $2 );

            # Real text in front of the mangled tag.  The test must be
            # "!~": "!$1 =~ ..." negates $1 first and then matches the
            # negation, which wrongly treats the text "0" as blank.
            return 0 if $before !~ m/^\s*$/;

            # Go on with whatever follows the mangled tag.
            $token = $rest;
        }
        else { return 0; }
    }
    return 1;
}

# Check if token contains any keyword in ignore list
#
# Returns 1 if one of the entries of the global @ignore occurs as a whole
# word in the token (or in its tag-free form), 0 otherwise.  On success the
# matching entry is stored in the global $lastIgnoreKeyword.
# Each entry is interpolated into the pattern as it is, so it is treated as
# a regular expression.
sub TokenContainsIgnoreKeyword {
    my ($token) = @_;
    $token =~ s/\s{2,}/ /sig;

    # If this token contains more than tmax words, do not ignore it
    my $plain = &ReduceSpaces($token);
    my @words = split /\s/, $plain;
    if ($debug) { print "#### C" . scalar(@words) . ": $plain\n"; }
    return 0 if @words > $tmax;

    foreach my $keyword (@ignore) {
        if (   $token =~ m/^.*?(\b$keyword\b).*?$/i
            || $plain =~ m/^.*?(\b$keyword\b).*?$/i )
        {
            $lastIgnoreKeyword = $keyword;
            return 1;
        }
    }
    return 0;
}

# Check if token already exists
#
# Returns 1 if the token, with runs of white space squeezed to one space,
# equals one of the tokens of the old page (global @oldtokens), or if it is
# too short to be checked; returns 0 otherwise.
sub TokenExists {
    my ($token) = @_;
    $token =~ s/\s{2,}/ /sig;

    # If this token contains <= tmin no. of words, don't check
    my @words = split /\s/, &ReduceSpaces($token);
    return 1 if @words <= $tmin;

    foreach my $oldtok (@oldtokens) {

        # $oldtok aliases the array element, so @oldtokens itself is
        # normalised here; repeating this on later calls changes nothing.
        $oldtok =~ s/\s{2,}/ /sig;
        return 1 if $token eq $oldtok;
    }
    return 0;
}

# Check if token contains ignore URL
#
# Returns 1 if the token contains a mangled hyperlink ("~~~~A ... HREF=...")
# whose tag matches one of the entries of the global @ignoreurl, 0 otherwise.
# On success the matching entry is stored in the global $lastIgnoreURL.
# Each entry is interpolated into the pattern as it is, so it is treated as
# a regular expression.
sub TokenContainsIgnoreURL {
    my ($token) = @_;
    $token =~ s/\s{2,}/ /sig;

    foreach my $url (@ignoreurl) {
        if ( $token =~ m/~~~~A.*?HREF=.*?$url.*?\@\@\@\@/i ) {
            $lastIgnoreURL = $url;
            return 1;
        }
    }
    return 0;
}

# Check if token contains end of hyperlink
#
# Returns 1 if the token contains a mangled closing anchor tag
# ("~~~~/A@@@@", case-insensitive) and 0 otherwise.  The marker contains no
# white space, so the token needs no white space normalisation first.
sub TokenContainsHlinkEnd {
    my ($token) = @_;
    return ( $token =~ m/~~~~\/A\@\@\@\@/i ) ? 1 : 0;
}

# Rewrite selected inline tags so that they are no longer split off as
# tokens of their own: "<TAG>" becomes "~~~~TAG@@@@".
#
#   $page - the HTML text
#   @tags - the tag names to mangle; a trailing "*" marks a tag that may
#           carry attributes, for which "<TAG attributes>" is mangled too
#
# Returns the mangled page.
sub MangleHTML {

    # Take ALL remaining arguments as tags ("my @tags = shift" would keep
    # only the first one, leaving every other tag unmangled).
    my ( $page, @tags ) = @_;

    $page =~ s/[\r\n]|\s\s/ /sig;    # Handle MSDOS-style line separators
    $page =~ s/&nbsp;/\@\@\@\@&nbsp;~~~~/sig;   # Handle non-breaking white space

    # Handle nested brackets: an anchor tag with another tag inside it.
    # The last group captures the whole remainder of the anchor tag.
    $page =~
      s/<A(\s+[^>]*)<([^>]*)>([^>]*)>/~~~~A$1~~~~$2\@\@\@\@$3\@\@\@\@/sig;

    foreach my $spec (@tags) {
        my $name = $spec;

        # Strip the "*" marker; left in the pattern it would act as a
        # regex quantifier on the last letter of the tag name.
        my $hasAttributes = ( $name =~ s/\*$// );

        # Plain opening and closing tag: <TAG>, </TAG>
        $page =~ s/<(\/*\Q$name\E)>/~~~~$1\@\@\@\@/sig;

        # Tag followed by a space and attributes: <TAG attr="...">
        if ($hasAttributes) {
            $page =~ s/<(\/*\Q$name\E .*?)>/~~~~$1\@\@\@\@/sig;
        }
    }

    return $page;
}

# Return the bare text of a token: mangled non-breaking spaces become
# blanks, mangled tags are turned back into real tags, all tags are removed,
# and white space is trimmed at both ends and squeezed to single blanks.
sub ReduceSpaces() {
    my ($text) = @_;

    # "for" aliases $_ to $text, so each substitution edits $text directly.
    for ($text) {
        s/\@\@\@\@&nbsp;~~~~/ /sig;                   # mangled &nbsp;
        s/~~~~/</sig;                                 # un-mangle tag start
        s/\@\@\@\@/>/sig;                             # un-mangle tag end
        s/<A(\s+[^>]*)<([^>]*)>([^>])*>//sig;         # anchor with nested tag
        s/<[^>]*>//sig;                               # any remaining tag
        s/^\s*//sig;
        s/\s*$//sig;
        s/\s+/ /sig;
    }

    return $text;
}

__END__

=head1 NAME

webdiff - Find and Highlight Differences Between Webpages

=head1 SYNOPSIS

webdiff [options]


=head1 OPTIONS

=over 8

=item B<--help>

Prints a brief help message and exits.

=item B<--man>

Prints the manual page and exits.

=item B<--archive>=<pathname>

Archive HTML file

=item B<--current>=<pathname>

Current HTML file

=item B<--out>=<pathname>

Output HTML file (with highlighting)

=item B<--basedir>=<pathname>

Base directory in which the ignore file is looked up (default: $HOME/.websec)

=item B<--hicolor>=<color>

Highlight color: blue (the default), yellow, pink, grey, or an explicit #rrggbb value

=item B<--ignore>=<filelist>

Comma-delimited list of named sections containing ignore keywords

=item B<--ignoreurl>=<filelist>

Comma-delimited list of named sections containing ignore urls

=item B<--tmin>=<number>

Don't check if token contains <= given no. of words

=item B<--tmax>=<number>

Don't apply the ignore keywords to a token that contains more than the given
number of words (default: 99999)

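=item B<--debug>

Print debug messages while comparing the pages

=item B<--asciimarker>

Additionally enclose each highlighted token in the plain-text markers
###>>> and <<<###

=item B<--ignorefile>=<filename>

Name of the ignore file inside the base directory (default: ignore.list)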

=back

=head1 DESCRIPTION

B<webdiff> will compare two webpages and create an output file with the changes
highlighted.


B<webdiff> is internal to B<websec> and isn't well documented.


=head1 SEE ALSO

L<websec(1)>


=head1 AUTHOR

Victor Chew is the original author of this software and
Baruch Even is continuing the maintenance.

=cut

vim:set et ts=4: