Perlanet::Trait::Scrubber - clean posts with HTML::Scrubber before aggregating


Perlanet documentation Contained in the Perlanet distribution.

Index


Code Index:

NAME

Top

Perlanet::Trait::Scrubber - clean posts with HTML::Scrubber before aggregating

DESCRIPTION

Top

Before a post is added to the aggregated feed, its HTML is first cleaned with HTML::Scrubber.

ATTRIBUTES

Top

scrubber

Top

An instance of HTML::Scrubber used to remove unwanted content from the feed entries. The default settings are built by _build_scrubber in the source of this module.

AUTHOR

Top

Dave Cross, <dave@mag-sol.com>

COPYRIGHT AND LICENSE

Top


Perlanet documentation Contained in the Perlanet distribution.
package Perlanet::Trait::Scrubber;
use Moose::Role;
use namespace::autoclean;

use HTML::Scrubber;

# The object whose scrub() method cleans entry HTML. Read-write, so a
# consumer may supply its own instance; lazy_build leaves construction
# of the default to _build_scrubber.
has scrubber => (
  lazy_build => 1,
  is         => 'rw',
);

# Builder for the 'scrubber' attribute.
#
# Returns a new HTML::Scrubber configured with the tag-specific rules in
# %scrub_rules and the default attribute policy in %scrub_def.
sub _build_scrubber {
  my $self = shift;

  # Tag-specific rules.
  my %scrub_rules = (
    img => {
      src   => qr{^https?://},  # only URLs with http:// or https://
      alt   => 1,               # alt attributes allowed
      align => 1,               # allow align on images
      style => 1,
      '*'   => 0,               # deny all others
    },
    style => 0,
    script => 0,
    span => {
      id => 0,                  # blogger(?) includes spans with id attribute
    },
    a => {
      href => 1,
      '*'  => 0,
    },
  );

  # Default attribute rules for HTML::Scrubber, used for tags that have
  # no entry in %scrub_rules.
  #
  # Each key must appear only once: in a hash list a later duplicate
  # silently replaces the earlier value, which previously turned the
  # 'src' script filter into an unconditional allow.
  my %scrub_def = (
    '*'           => 1,
    'href'        => qr{^(?!(?:java)?script)}i,
    'src'         => qr{^(?!(?:java)?script)}i,
    'cite'        => '(?i-xsm:^(?!(?:java)?script))',
    'language'    => 0,
    'name'        => 1,
    'value'       => 1,
    'onblur'      => 0,
    'onchange'    => 0,
    'onclick'     => 0,
    'ondblclick'  => 0,
    'onerror'     => 0,
    'onfocus'     => 0,
    'onkeydown'   => 0,
    'onkeypress'  => 0,
    'onkeyup'     => 0,
    'onload'      => 0,
    'onmousedown' => 0,
    'onmousemove' => 0,
    'onmouseout'  => 0,
    'onmouseover' => 0,
    'onmouseup'   => 0,
    'onreset'     => 0,
    'onselect'    => 0,
    'onsubmit'    => 0,
    'onunload'    => 0,
    'type'        => 1,
    'style'       => 1,
    'class'       => 0,
    'id'          => 0,
  );

  my $scrub = HTML::Scrubber->new;
  $scrub->rules(%scrub_rules);
  # First argument 1: tags without a specific rule are allowed.
  $scrub->default(1, \%scrub_def);

  return $scrub;
}

# Wrap clean_html so that its result is additionally passed through the
# configured scrubber.
#
# Calls the original clean_html with $html, then returns the output of
# scrubber->scrub on that result. The scrubbed text is what must be
# returned; returning the unscrubbed $html would make this trait a no-op.
around 'clean_html' => sub {
  my $orig = shift;
  my ($self, $html) = @_;

  $html = $self->$orig($html);

  return $self->scrubber->scrub($html);
};

1;