dirtyRSS - A dirty but self-contained RSS parser


dirtyRSS documentation Contained in the dirtyRSS distribution.

Index


Code Index:

NAME

Top

dirtyRSS - A dirty but self-contained RSS parser

SYNOPSIS

Top

  use dirtyRSS;

  $tree = parse($in);

  die("$tree\n") unless (ref $tree);

  disptree($tree, 0);

DESCRIPTION

Top

dirtyRSS is a terribly dirty RSS parser, which doesn't require any other module to work. It parses a string, and creates a tree, which represents the RSS feed.

It doesn't support the complete XML syntax, only things that are commonly used in feeds.

All tags are lowercased, namespace indicators are removed, and several typical non-RSS-2.0 tags are translated shamelessly to their 2.0 counterpart. There is also plenty of fiddling with the data on the way.

The only good thing about this parser, is that it works most of the time, and it makes the tree look as if it came from an RSS 2.0, for a large parts of feeds of various sorts.

If the parse fails, an error message is passed via the return value, rather than a reference to an array.

EXPORT

Top

The following functions are exported:

parse() and disptree().

BUGS

Top

The module is based upon trials and errors, so naturally there are going to be more errors.

LICENSE

Top

This module is released to the open domain. There are no restrictions using it.

HISTORY

Top

The module is part of the Editaste site, http://www.editaste.com/rawlist

AUTHOR

Top

Eli Billauer, <perldev@editaste.com>


dirtyRSS documentation Contained in the dirtyRSS distribution.

package dirtyRSS;

use strict;
use warnings;

require Exporter;

@dirtyRSS::ISA = qw[Exporter];
@dirtyRSS::EXPORT = qw[&parse &disptree];
$dirtyRSS::VERSION = '0.3';

our %htmlescapes = (
		    'quot' => 34,
		    'amp' => 38,
		    'apos' => 39,
		    'lt' => 60,
		    'gt' => 62,
		    'nbsp' => 32, # Was 160, but we make it a normal space
		    'iexcl' => 161,
		    'cent' => 162,
		    'pound' => 163,
		    'curren' => 164,
		    'yen' => 165,
		    'brvbar' => 166,
		    'sect' => 167,
		    'uml' => 168,
		    'copy' => 169,
		    'ordf' => 170,
		    'laquo' => 171,
		    'not' => 172,
		    'shy' => 173,
		    'reg' => 174,
		    'macr' => 175,
		    'deg' => 176,
		    'plusmn' => 177,
		    'sup2' => 178,
		    'sup3' => 179,
		    'acute' => 180,
		    'micro' => 181,
		    'para' => 182,
		    'middot' => 183,
		    'cedil' => 184,
		    'sup1' => 185,
		    'ordm' => 186,
		    'raquo' => 187,
		    'frac14' => 188,
		    'frac12' => 189,
		    'frac34' => 190,
		    'iquest' => 191,
		    'agrave' => 192,
		    'aacute' => 193,
		    'acirc' => 194,
		    'atilde' => 195,
		    'auml' => 196,
		    'aring' => 197,
		    'aelig' => 198,
		    'ccedil' => 199,
		    'egrave' => 200,
		    'eacute' => 201,
		    'ecirc' => 202,
		    'euml' => 203,
		    'igrave' => 204,
		    'iacute' => 205,
		    'icirc' => 206,
		    'iuml' => 207,
		    'eth' => 208,
		    'ntilde' => 209,
		    'ograve' => 210,
		    'oacute' => 211,
		    'ocirc' => 212,
		    'otilde' => 213,
		    'ouml' => 214,
		    'times' => 215,
		    'oslash' => 216,
		    'ugrave' => 217,
		    'uacute' => 218,
		    'ucirc' => 219,
		    'uuml' => 220,
		    'yacute' => 221,
		    'thorn' => 222,
		    'szlig' => 223,
		    'agrave' => 224,
		    'aacute' => 225,
		    'acirc' => 226,
		    'atilde' => 227,
		    'auml' => 228,
		    'aring' => 229,
		    'aelig' => 230,
		    'ccedil' => 231,
		    'egrave' => 232,
		    'eacute' => 233,
		    'ecirc' => 234,
		    'euml' => 235,
		    'igrave' => 236,
		    'iacute' => 237,
		    'icirc' => 238,
		    'iuml' => 239,
		    'eth' => 240,
		    'ntilde' => 241,
		    'ograve' => 242,
		    'oacute' => 243,
		    'ocirc' => 244,
		    'otilde' => 245,
		    'ouml' => 246,
		    'divide' => 247,
		    'oslash' => 248,
		    'ugrave' => 249,
		    'uacute' => 250,
		    'ucirc' => 251,
		    'uuml' => 252,
		    'yacute' => 253,
		    'thorn' => 254,
		    'yuml' => 255
);

# These are typical HTML tags, which should be omitted.

our %ignore_tags = (
		    'img'     => 1,
		    'a'       => 1,
		    'p'       => 1,
		    'br'      => 1,
		    'div'     => 1,
		    'span'    => 1,
		    'b'       => 1,
		    'i'       => 1,
		    'u'       => 1,
		    'body'    => 1,
		    'center'  => 1,
		    'code'    => 1,
		    'font'    => 1,
		    'form'    => 1,
		    'h1'      => 1,
		    'h2'      => 1,
		    'h3'      => 1,
		    'h4'      => 1,
		    'head'    => 1,
		    'hr'      => 1,
		    'html'    => 1,
		    'li'      => 1,
		    'ul'      => 1,
		    'ol'      => 1,
		    'pre'     => 1,
		    'style'   => 1,
		    'sub'     => 1,
		    'sup'     => 1,
		    'script'  => 1,
		    'small'   => 1,
		    'big'     => 1,
		    'table'   => 1,
		    'td'      => 1,
		    'tr'      => 1,
		    'th'      => 1,
		    'textarea'=> 1,
		    'strong'  => 1,
		    'strike'  => 1,
		    'blockquote' => 1,
	      );

our %ns = (
	   # RSS 2.0 tags
	   'xml'          => 'xml',
	   'rss'          => 'rss',
	   'rdf'          => 'rdf',
	   'item'         => 'item',
	   'channel'      => 'channel',
	   'image'        => 'image',
	   'title'        => 'title',
	   'link'         => 'link',
	   'description'  => 'description',
           'language'     => 'language',
	   'copyright'    => 'copyright',
	   'pubdate'      => 'pubdate',
	   'lastbuilddate'=> 'lastbuilddate',
	   'category'     => 'category',
	   'generator'    => 'generator',
	   'ttl'          => 'ttl',
	   'url'          => 'url',
	   'width'        => 'width',
	   'height'       => 'height',
	   'version'      => 'version',
	   'encoding'     => 'encoding',
	   'guid'         => 'guid',
	   'enclosure'    => 'enclosure',

	   # RSS 1.0 tags translated to RSS 2.0
	   'subject'      => 'category',
	   'rights'       => 'copyright',
	   'modified'     => 'lastbuilddate',
	   'date'         => 'pubdate',
	   'resource'     => 'resource', # 1.0 specific!

	   # Atom 1.0 tags translated to RSS 2.0
	   'feed'      => 'channel',
	   'summary'   => 'description',
	   'content'   => 'description',
	   'subtitle'  => 'description',
	   'lang'      => 'language',
	   'published' => 'pubdate',
	   'updated'   => 'lastbuilddate',
	   'logo'      => 'image',
	   'entry'     => 'item',
	   'href'      => 'link',
	  );

# Note that %specials refer to the *right* side of %ns, so only one
# entry is needed for each functional tag or its alias

# TRUE means array type
our %specials = (
		 'item' => 1,
		 'channel' => 1,
		 'image' => 1,
		 'xml' => 0,
		 'rss' => 0,
		 'rdf' => 0,
		);

sub parse {
  my ($in, $debug) = @_;
  
  $in =~ s/<!--.*?-->//gs; # Remove comments

  my @segs = map { /^[ \n\r\t]*(.*?)[ \n\r\t]*$/s } ($in =~ /(<!\[CDATA\[.*?\]\]>|<[^>]+?>|[^<]+)/gs);
  
  # Strip off CDATAs. Added a prefix space to avoid accidental tag hits
  @segs = map { /^<!\[CDATA\[(.*?)\]\]>$/s ? " $1" : $_ } @segs;

  @segs = grep { length > 0 } @segs;
  
  my @stack = ();
  my @valstack = ();
  my %tree = ();
  my $here = \%tree;
  my @parent = ();
  my $lastval = "";
  
  foreach my $elem (@segs) {
    my ($modifier, $tag, $attr, $empty) = ($elem =~ /^<([!?\#]{0,1})[ \n\r\t]*([^ \n\r\t]*[^ \/\n\r\t])[ \n\r\t]*(.*?)[ \n\r\t]*(\/{0,1})>$/s);

    $empty = 1 if ($modifier); 
    
    if (defined $tag) {
      $tag = lc $tag;		# We're case-insensitive

      # Note that the regex below removes "dc:"-like namespace prefices
      my $closing;
      ($closing, $tag) = ($tag =~ /^(\/{0,1}).*?:{0,1}([^:]*)$/);

      if ($ignore_tags{$tag}) {
	htmltags($here, unescape($elem));
	next;
      }
      
      unless ($closing) {	# Opening tags...
	push @stack, $tag;
	
	my $alias = $ns{$tag};
	
	if (defined $alias) {
	  push @valstack, $lastval;
	  $lastval = "";

	  if (defined $specials{$alias}) {
	    push @parent, $here;
	    $here = {};	    
	  }

	  # Note that attributes may pollute the parent hash. This is
	  # necessary to support Atom 1.0

	  my @pairs = ($attr =~ /([^ \n\r\t]+?=\'[^\']*?\'|[^ \n\r\t]+?=\"[^\"]*?\"|[^ \n\r\t]+?=[^ \n\r\t]*)/g);
	  
	  foreach my $p (@pairs) {	    
	    my ($k, $v) = ($p =~ /(.+?)=(.*)/);

	    $k = lc $k;

	    $v = $1 
	      if (($v =~ /^\'(.*)\'$/s) || ($v =~ /^\"(.*)\"$/s));    
	    
	    ($k) = ($k =~ /([^:]*)$/); # Remove namespace prefix if present
	    
	    my $alias = $ns{$k};
	    
	    if (defined $alias) {
	      $here->{$alias} = unescape($v);
	    } else {
	      warn "Ignored attribute $k=$v\n"
		if $debug;
	    }
	  }	 
	} else {
	  warn "Ignored tag $tag\n"
	    if $debug;
	}
      }
      
      if ($closing || $empty) { # Closing tags, or close an empty opening tag
	my $p = pop @stack;
	
	return "Bad XML tag nesting. Expected end tag for '$p', got '/$tag'"
	  unless ($p eq $tag);
	
	my $alias = $ns{$tag};
	
	if (defined $alias) {
	  my $thislastval = $lastval;
	  $lastval = pop @valstack;
  
	  if (defined $specials{$alias}) {
	    my $parent = pop @parent;
	    
	    if ($specials{$alias}) { # Array type
	      $parent->{$alias} = []
		unless ((ref $parent->{$alias}) &&
			(ref $parent->{$alias}) eq 'ARRAY');
	      push @{$parent->{$alias}}, $here;
	    } else {
	      $parent->{$alias} = $here;
	    }
	    
	    # UGLY HACK ALERT:
	    # Just before leaving a tree node, we clean the 'description'
	    # from possible HTML tags, and harvest the relevant values,
	    # if applicable. This is because some feeds think that the
	    # description should be rendered on a browser as is (cross
	    # scripting, anybody?)

	    $here->{'description'} =~ s/(<.*?>)/htmltags($here, $1)/ges
	      if (defined $here->{'description'});
	   
	    $here = $parent;	
	  } else {
	    $here->{$alias} = unescape($thislastval)
	      unless ((length($thislastval) == 0) &&
		      (defined $here->{$alias}) &&
		      (length $here->{$alias}));	    
	  }
	}
      }
    } else {
      $lastval = (length $lastval) ? "$lastval $elem" : $elem;    
    }
    
  }
  return("Bad XML nesting: There were unclosed tags at EOF")
    if (@stack);
  
  return \%tree;
  
}

sub htmltags {
  my ($here, $seg) = @_;

  my ($tag, $attr) = ($seg =~ /^<[ \n\r\t]*([^ \n\r\t]+)[ \n\r\t]*(.*?)[ \n\r\t]*>$/s);

  return "" unless (defined $tag);

  $tag = lc $tag;

  # Respect HTML line breaks, even though the renderer won't
  return "\n"
    if (($tag eq 'p') || ($tag eq 'br'));

  if (($tag eq 'img') && !(defined $here->{'altimage'})) {
    my $new = {};
    $here->{'altimage'} = $new;
    $here = $new;
  } elsif (($tag eq 'a') && !(defined $here->{'altlink'})) {
    my $new = {};
    $here->{'altlink'} = $new;
    $here = $new;
  } else { return ""; }
	   
  my @pairs = ($attr =~ /([^ \n\r\t]+?=\'[^\']*?\'|[^ \n\r\t]+?=\"[^\"]*?\"|[^ \n\r\t]+?=[^ \n\r\t]*)/g);
  
  foreach my $p (@pairs) {	    
    my ($k, $v) = ($p =~ /(.+?)=(.*)/);
    
    $k = lc $k;

    $v = $1 
      if (($v =~ /^\'(.*)\'$/s) || ($v =~ /^\"(.*)\"$/s));    
    
    $here->{$k} = $v;
  }
  
  return ""; # This makes the function useful in substitutions
}

sub single_unescape {
  my ($ent) = @_;

  my $ord = $htmlescapes{lc($ent)};
  return chr($ord) if defined $ord;
  return ""; # Conversion failed, return nothing
}

sub unescape {
  # Note! Unicode characters are escaped to space!
  my ($x) = @_;
  # For now, we go wild, and convert all escape markers

  # Run twice, because of double-nested markups :-O
  for (my $i=0; $i<2; $i++) {
    $x =~ s/&(\w+);/single_unescape($1)/ge;
    $x =~ s/&\#(\d+);/chr($1 < 256 ? $1 : 32)/ge;
    $x =~ s/&\#x([0-9a-fA-F]+);/chr(hex($1) < 256 ? hex($1) : 32)/ige;
  }
  return $x;
}

sub disptree {
  my ($what, $s) = @_;

  foreach my $k (sort keys %{$what}) {
    my $v = $what->{$k};

    if ((ref $v) eq 'HASH') {
      print " "x$s."$k\n";
      disptree($v, $s+2);
      next;
    }

    if ((ref $v) eq 'ARRAY') {
      my $count;
      for ($count=0; $count<=$#{$v}; $count++) {
	print " "x$s.$k."[$count]\n";
	disptree($v->[$count], $s+2);
      }
      next;
    }

    print " "x$s."$k => $v\n";    
  }    
}

1;

__END__