package RSSLite; ## ## Copyright (c) 2000 Scott Thomason. All rights reserved. ## This program is free software; you can redistribute it ## and/or modify it under the same terms as Perl itself. ## use strict; use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION); use Exporter; @ISA = ('Exporter'); @EXPORT = qw/parseXML usableXML/; @EXPORT_OK = qw/parseXML usableXML isRSS isRDF isSN isWL xml_content_string xml_content_array/; $VERSION = '0.06'; use Carp; use Data::Dumper; sub parseXML { my ($rr, $cr) = @_; die "Parms to 'parse' must be refs to a hash and XML content!" unless (ref($rr) and ref($cr)); return unless $$cr; ## Gotta have some content to parse my $type = usableXML($cr) or die "Content must be RSS/RDF/ScriptingNews/Weblog XML " . "(or something pretty close)"; preprocess($cr); if ($type == 1 or $type == 2) { parseRSS($rr, $cr); } elsif ($type == 3) { parseSN($rr, $cr); } elsif ($type == 4) { parseWL($rr, $cr); } else { die "Screwed up XML type-checking somehow!"; } postprocess($rr); } sub preprocess { my $cr = shift; ## ## Help create "well-formed" XML so parser doesn't puke by ## 1. Making unix-style line endings ## 2. Using & for & (this screws up urls, but we fix it later) ## 3. Removing objectionable characters ## $$cr =~ s|<(/*)rss\d+:(.*?)>|<$1$2>|g; $$cr =~ s|<([^<> ]+)\s+(.+?)\s+/>|<$1 $2>|g; $$cr =~ s/\r\n?/\n/g; $$cr =~ s/&(?!([a-zA-Z0-9]+|#\d+);)/&/g; $$cr =~ s/[^\s\d\w!@#\$%^&\*i\(\)\-\+=:;"'<>,\.\/\?]/ /g; ## Tidy up for debugging by starting open tags on new line # $content =~ s|(?!\n)<(?!/)|\n<|gs; } sub postprocess { my $rr = shift; $rr->{'link'} =~ s/&/&/gi; if (defined($rr->{'items'})) { my $i; foreach $i (@{$rr->{'items'}}) { $i->{'link'} = trim($i->{'link'}); # Put stuff into the right name if necessary if (defined($i->{'url'}) and not $i->{'link'}) { $i->{'link'} = $i->{'url'}; } # Fix pre-process munging $i->{'link'} =~ s/&/&/gi; # See if you can use misplaced url in title for empty links if (not $i->{'link'}) { if ($i->{'title'} =~ /^http:/) { $i->{'link'} = $i->{'title'}; } elsif ($i->{'title'} =~ /"(http.*?)"/) { $i->{'link'} = $1; $i->{'title'} =~ s/<.*?>//; } else { next; } } # Make sure you've got an http/ftp link if ($i->{'link'} !~ m{^(http|ftp)://}i) { ## Rip link out of anchor tag $i->{'link'} =~ m{a\s+href=("|")?(.*?)("|>|"|>)?}i; if ($2) { $i->{'link'} = $2; } elsif ($i->{'link'} =~ m{[\.#/]}i and $rr->{'link'} =~ m{^http://}) { ## Smells like a relative url if (substr($i->{'link'}, 0, 1) ne '/') { $i->{'link'} = '/' . $i->{'link'}; } $i->{'link'} = $rr->{'link'} . $i->{'link'}; } else { next; } } $i->{'link'} =~ s/ //g; } } } sub parseRSS { my ($rr, $cr) = @_; my $channel = xml_content_string('channel', $cr); $channel =~ s|||gis; clean(\$channel); my $ca; my @channel_attrs = ($channel =~ m|(<.*?>.*?)|gi); foreach $ca (@channel_attrs) { $ca =~ m|^<(.*?)>(.*?)$|; $rr->{$1} = trim($2); } $rr->{'items'} = (); my $item; foreach $item (xml_content_array('item', $cr)) { clean(\$item); my @item_attrs = ($item =~ m|(<.*?>.*?)|gi); my $ia; my %ia; foreach $ia (@item_attrs) { $ia =~ m|^<(.*?)>(.*?)$|; $ia{$1} = trim($2); } push(@{$rr->{'items'}}, \%ia); } } sub parseSN { my ($rr, $cr) = @_; my $channel = xml_content_string('header', $cr); $channel =~ s|||gis; clean(\$channel); my $ca; my @channel_attrs = ($channel =~ m|(<.*?>.*?)|gi); foreach $ca (@channel_attrs) { $ca =~ m|^<(.*?)>(.*?)$|; $rr->{$1} = trim($2); } ## ## Alias SN to RSS terms ## if (exists $rr->{'channelDescription'}) { $rr->{'description'} = $rr->{'channelDescription'}; } if (exists $rr->{'channelTitle'}) { $rr->{'title'} = $rr->{'channelTitle'}; } if (exists $rr->{'channelLink'}) { $rr->{'link'} = $rr->{'channelLink'}; } $rr->{'items'} = (); my $item; foreach $item (xml_content_array('item', $cr)) { clean(\$item); my @item_attrs = ($item =~ m|(<.*?>.*?)|gi); my $ia; my %ia; foreach $ia (@item_attrs) { $ia =~ m|^<(.*?)>(.*?)$|; $ia{$1} = trim($2); } # Links are nested, kill prev {'link'} and rebuild attrs inside it undef $ia{'link'}; my @linkitems = xml_content_array('link', \$item) or next; my $linkitem = $linkitems[0]; ## Usually first one is most relevant @item_attrs = ($linkitem =~ m|(<.*?>.*?)|gi); foreach $ia (@item_attrs) { $ia =~ m|^<(.*?)>(.*?)$|; $ia{$1} = trim($2); } # Alias SN to RSS if (exists $ia{'text'}) { $ia{'description'} = $ia{'text'}; } if (exists $ia{'linetext'}) { $ia{'title'} = $ia{'linetext'}; } if (exists $ia{'url'}) { $ia{'link'} = $ia{'url'}; } push(@{$rr->{'items'}}, \%ia); } } sub parseWL { my ($rr, $cr) = @_; # my $channel = xml_content_string('header', $cr); # $channel =~ s|||gis; # clean(\$channel); # my $ca; # my @channel_attrs = ($channel =~ m|(<.*?>.*?)|gi); # foreach $ca (@channel_attrs) { # $ca =~ m|^<(.*?)>(.*?)$|; # $rr->{$1} = trim($2); # } ## ## Alias SN to RSS terms ## # if (exists $rr->{'channelDescription'}) { # $rr->{'description'} = $rr->{'channelDescription'}; # } # if (exists $rr->{'channelTitle'}) { # $rr->{'title'} = $rr->{'channelTitle'}; # } # if (exists $rr->{'channelLink'}) { # $rr->{'link'} = $rr->{'channelLink'}; # } $rr->{'items'} = (); my $item; foreach $item (xml_content_array('link', $cr)) { clean(\$item); my @item_attrs = ($item =~ m|(<.*?>.*?)|gi); my $ia; my %ia; foreach $ia (@item_attrs) { $ia =~ m|^<(.*?)>(.*?)$|; $ia{$1} = trim($2); } # Alias WL to RSS if (exists $ia{'url'}) { $ia{'link'} = $ia{'url'}; } push(@{$rr->{'items'}}, \%ia); } } sub usableXML { my $cref = shift; my $content = $$cref; ## Don't change caller's content just for usability check preprocess(\$content); return 1 if isRSS(\$content); return 2 if isRDF(\$content); return 3 if isSN(\$content); return 4 if isWL(\$content); return 0; } sub isRSS { my $cref = shift; return scalar($$cref =~ /.*<\/rss>/is); } sub isRDF { my $cref = shift; return scalar($$cref =~ /.*<\/rdf:RDF>/is); } sub isSN { my $cref = shift; return scalar($$cref =~ /.*<\/scriptingnews>/is); } sub isWL { my $cref = shift; return scalar($$cref =~ /.*<\/weblog>/is); } sub xml_content_string { my $tag = shift; my $cref = shift; $$cref =~ /<${tag}.*?>(.*)<\/${tag}>/is; return $1; } sub xml_content_array { my $tag = shift; my $cref = shift; my $keeptags = shift; $keeptags = 0 unless $keeptags; my @result; if ($keeptags) { @result = ($$cref =~ /(<${tag}.*?>.*?<\/${tag}>)/gis); } else { @result = ($$cref =~ /<${tag}.*?>(.*?)<\/${tag}>/gis); } return @result; } sub clean { my $cref = shift; $$cref =~ s{(\n|

|

|||||||||
|
||)}{ }gsi; } sub trim { my $s = shift; $s =~ s/^\s*(.*?)\s*$/$1/; return $s; } 1;