/usr/local/CPAN/gmuck/HTML/GMUCK.pm


package HTML::GMUCK;

# $Id: GMUCK.pm,v 1.24 2007/04/01 20:26:55 scop Exp $

use strict;

require 5.006;

use vars qw($VERSION $Tag_End $Tag_Start $Non_Tag_End
            $URI_Attrs $End_Omit $All_Elems
            $Min_Elems $Compat_Elems $Min_Attrs $MIME_Type @MIME_Attrs
            %Req_Attrs $All_Attrs $Depr_Elems @Depr_Attrs @Int_Attrs
            @Length_Attrs @Fixed_Attrs);

use Carp qw(carp);

no warnings 'utf8';

BEGIN
{

  $VERSION = sprintf("%d.%02d", q$Revision: 1.24 $ =~ /(\d+)\.(\d+)/);

  # --- Preload regexps.

  my $tmp = '';
  my %tmp = ();

  if (! do 'HTML/GMUCK/regexps.pl') {
    my $err = $! || $@;
    die "Error reading HTML/GMUCK/regexps.pl: $err";
  }

}

# ----- Constructors -------------------------------------------------------- #

sub new
{
  my ($class, %attr) = @_;

  my $this = bless({
                    _mode         => undef,
                    _xml          => undef,
                    _xhtml        => undef,
                    _html         => undef,
                    _tab_width    => undef,
                    _num_errors   => undef,
                    _num_warnings => undef,
                    _quote        => undef,
                    _min_attrs    => undef,
                   },
                   (ref($class) || $class));

  my $tab_width = delete($attr{tab_width});
  $tab_width = 4 unless defined($tab_width);
  $this->tab_width($tab_width) or $this->tab_width(4);

  my $mode = delete($attr{mode});
  $mode = 'XHTML' unless defined($mode);
  $this->mode($mode) or $this->mode('XHTML');

  my $quote = delete($attr{quote});
  $this->quote(defined($quote) ? $quote : '"');

  $this->min_attributes(delete($attr{min_attributes}));

  $this->reset();

  if (my @unknown = keys(%attr)) {
    carp("** Unrecognized attributes: " . join(',', sort(@unknown)));
  }

  return $this;
}

# ---------- Check: deprecated ---------------------------------------------- #

sub deprecated { return shift->_wrap('_deprecated', @_);}

sub _deprecated
{
  my ($this, $line) = @_;
  my @errors = ();

  while ($line =~ /\b(document\.location)\b/go) {
    push(@errors, { col  => $this->_pos($line, pos($line) - length($1)),
                    type => 'W',
                    mesg =>
                    'document.location is deprecated, use window.location ' .
                    'instead',
                  },
        );
  }

  # ---

  return @errors unless $this->{_html};

  # Optimization.
  return @errors unless $line =~ $Tag_Start;

  # ---

  while ($line =~ /
                  <
                  (\/?)
                  (
                    ($Depr_Elems)
                    (?:$|$Tag_End|\s)
                  )
                  /giox) {
    push(@errors, { col  => $this->_pos($line, pos($line) - length($2)),
                    elem => $3,
                    mesg => 'deprecated element' . ($1 ? ' end' : ''),
                    type => 'W',
                  },
        );
  }

  # ---

  foreach my $re (@Depr_Attrs) {

    while ($line =~ /$re/g) {
      my ($m, $elem, $attr) = ($1, $2, $3);
      if ($attr) {
        push(@errors, { col  => $this->_pos($line, pos($line) - length($m)),
                        elem => $elem,
                        attr => $attr,
                        type => 'W',
                        mesg => 'deprecated attribute for this element',
                      },
            );
      }
    }
  }

  return @errors;
}

# ----- Check: attributes --------------------------------------------------- #

sub attributes { return shift->_wrap('_attributes', @_); }

sub _attributes
{
  my ($this, $line) = @_;
  return () unless $this->{_html};

  my @errors = ();

  # ---

  my $type = $this->{_xhtml} ? 'E' : 'W';

  # BUG: Does not catch non-lowercase minimized attributes, like CHECKED.
  while ($line =~ /
                  (?:^\s*|(?<=[\w\"\'])\s+)
                  (
                    ($All_Attrs)
                    =
                    (.\S?) # Would like to see ['"], possibly backslashed.
                  )
                  /giox) {

    my ($pos, $att, $q) = (pos($line) - length($1), $2, $3);

    if ($att ne lc($att)) {
      push(@errors, { col  => $this->_pos($line, $pos),
                      attr => $att,
                      type => $type,
                      mesg => 'non-lowercase attribute',
                    },
          );
    }

    if (my $tq = $this->{_quote}) {
      my $pos = $this->_pos($line, $pos + length($att) + 1);
      if ($q =~ /\\?([\"\'])/o) {
        if ($1 ne $tq) {
          push(@errors, { col  => $pos,
                          type => 'W',
                          attr => $att,
                          mesg => "quote attribute values with $tq",
                        },
              );
        }
      } else {
        push(@errors, { col  => $pos,
                        attr => $att,
                        type => 'W',
                        mesg => 'unquoted value',
                      },
            );
      }
    }
  }

  # ---

  # Optimization.
  return @errors unless $line =~ /$Tag_Start\w../o;

  # ---

  foreach my $re (@Int_Attrs) {

    my $msg = 'value should be an integer: "%s"';

    while ($line =~ /$re/g) {
      my ($m, $el, $att, $q, $val) = ($1, $2, $3, $4, $5);
      my $lel = lc($el);
      my $latt = lc($att);

      if ($val !~ /^\d+$/o &&
          $val !~ /[\\\$\(\[]/o   # bogus protection
         ) {

        # Special case: img->border only in HTML 4
        next if ($this->{_xhtml} && $lel eq 'img' && $latt eq 'border');

        push(@errors, { col  => $this->_pos($line, pos($line) - length($m)),
                        type => 'E',
                        mesg => sprintf($msg, $val),
                        elem => $el,
                        attr => $att,
                      },
            );
      }
    }
  }

  # ---

  foreach my $re (@Length_Attrs) {

    my $msg = 'value should be an integer or a percentage: "%s"';

    while ($line =~ /$re/g) {

      my ($m, $el, $att, $q, $val) = ($1, $2, $3, $4, $5);

      if ($val !~ /^\d+%?$/o &&
          $val !~ /[\\\$\(\[]/o   # bogus protection
         ) {

        push(@errors, { col  => $this->_pos($line, pos($line) - length($m)),
                        type => 'E',
                        mesg => sprintf($msg, $val),
                        elem => $el,
                        attr => $att,
                      },
            );
      }
    }
  }

  # ---

  foreach (@Fixed_Attrs) {

    my ($re, $vre, $vals) = @$_;
    $vre = $this->{_xml} ? qr/$vre/ : qr/$vre/i;
    my $msg = 'invalid value: "%s", should be %s"%s"';

    while ($line =~ /$re/g) {

      my ($m, $el, $att, $q, $val) = ($1, $2, $3, $4, $5);

      if ($val !~ $vre &&
          $val !~ /[\\\$\(\[]/o    # bogus protection
         ) {

        my $latt = lc($att);
        my $lel  = lc($el);

        # Special case: html->xmlns and pre,script,style->xml:space XHTML-only
        next if (! $this->{_xhtml} &&
                 (($lel eq 'html' && $latt eq 'xmlns') ||
                  ($latt eq 'xml:space' && $lel =~ /^(pre|s(cript|tyle))$/o)));

        push(@errors, { col  => $this->_pos($line, pos($line) - length($m)),
                        type => 'E',
                        mesg => sprintf($msg, $val,
                                        ($vals =~ /\|/o) ? 'one of ' : '',
                                        $vals),
                        elem => $el,
                        attr => $att,
                      },
            );
      }
    }
  }

  # ---

  #
  # Note that minimized attributes are forbidden only in XHTML, but it
  # is legal to have them in HTML too.
  #
  # Not doing this check inside <>'s would result in too much bogus.
  #
  if ($this->{_min_attrs}) {
    while ($line =~ /
                      <
                      $Non_Tag_End+?
                      \s
                      (
                        ($Min_Attrs)
                        ([=\s]|$Tag_End)
                      )
                      /giox) {
      my ($m, $attr, $eq) = ($1, $2, $3);
      if ($eq ne '=') {
        push(@errors, { col  => $this->_pos($line, pos($line) - length($m)),
                        attr => $attr,
                        type => $type,
                        mesg => 'minimized attribute',
                      },
            );
      }
    }
  }

  # ---

  while (my ($attr, $re) = each(%Req_Attrs)) {

    my $msg = 'missing required attribute: "%s"';

    # Parens: 1: for pos(), 2:element, 3: attribute (or undef if not found)
    while ($line =~ /$re/g) {

      my ($m, $el, $att) = ($1, $2, $3);

      if (! $att) {

        my $lel  = lc($el);

        # Special case: @name not required for input/@type's submit and reset
        next if ($lel eq 'input' && $attr eq 'name' &&
                 # TODO: this is crap
                 $line =~ /\stype=(\\?[\"\'])?(submi|rese)t\b/io);

        # Special case: map/@id required only in XHTML 1.0+
        next if ($lel eq 'map' && $attr eq 'id' && ! $this->{_xhtml});

        push(@errors, { col  => $this->_pos($line, pos($line) - length($m)),
                        type => 'E',
                        mesg => sprintf($msg, $attr),
                        elem => $el,
                      },
            );
      }
    }
  }

  return @errors;
}

# ----- Check: MIME types --------------------------------------------------- #

sub mime_types { return shift->_wrap('_mime_types', @_); }

sub _mime_types
{
  my ($this, $line) = @_;
  return () unless $this->{_html};

  # Optimization. "<a type=" is the shortest we know nowadays.
  return () unless $line =~ /$Tag_Start.{6}/o;

  my @errors = ();
  my $msg = 'bad media type: "%s"';
  my $jsmsg =
    'not recommended media type: "%s", see RFC 4329 (and also CAVEATS in the HTML::GMUCK manual page)';

  foreach my $re (@MIME_Attrs) {

    while ($line =~ /$re/g) {

      my ($elem, $attr, $m, $mtype) = ($1, $2, $4, $5);
      my $pos = $this->_pos($line, pos($line) - length($m));

      if ($mtype !~ $MIME_Type) {
        push(@errors, { col  => $pos,
                        type => 'E',
                        elem => $elem,
                        attr => $attr,
                        mesg => sprintf($msg, $mtype),
                      },
            );
      } elsif (lc($elem) eq 'script' &&
               $mtype =~ /(ecm|jav)ascript/io &&
               lc($mtype) !~ '^application/(ecm|jav)ascript$') {
        push(@errors, { col  => $pos,
                        type => 'W',
                        elem => $elem,
                        attr => $attr,
                        mesg => sprintf($jsmsg, $mtype),
                      },
            );
      }
    }
  }

  return @errors;
}

# ----- Check: elements ----------------------------------------------------- #

sub elements { return shift->_wrap('_elements', @_); }

sub _elements
{
  my ($this, $line) = @_;
  return () unless $this->{_html};

  my @errors = ();

  # ---

  my $type = $this->{_xhtml} ? 'E' : 'W';
  my $msg = 'non-lowercase element%s';

  while ($line =~ /
                  <
                  (\/?)
                  (
                    ($All_Elems)
                    (\s|$Tag_End|\Z)   # \Z) because $) would screw my indentation :)
                  )
                  /giox) {
    my ($slash, $pos, $elem) = ($1, pos($line) - length($2), $3);
    if ($elem ne lc($elem)) {
      push(@errors, { col  => $this->_pos($line, $pos),
                      type => $type,
                      elem => $elem,
                      mesg => sprintf($msg, ($slash ? ' end' : '')),
                    },
          );
    }
  }

  # ---

  $msg = 'missing end tag';

  while ($line =~ /
                  <
                  (
                    ($End_Omit)
                    .*?
                    $Tag_End
                    [^<]*
                    <
                    (.?)
                    ($End_Omit)
                  )
                  /giox) {
    my ($m, $start, $slash, $end) = ($1, $2, $3, $4);
    if ((lc($start) eq lc($end) && $slash ne '/') ||
        # TODO: this needs tuning.  See t/002endtag.t, line 6.
        (lc($start) ne lc($end))) {
      push(@errors, { col  => $this->_pos($line, pos($line) - length($m)),
                      mesg => $msg,
                      elem => $start,
                      type => 'W',
                    },
          );
    }
  }

  # ---

  # We also allow a backslashed "/", they're common in eg. Perl regexps.
  # Consider
  #   $foo =~ s/bar/baz<br \/>/;
  while ($line =~ /
                  <                # TODO: Do we really need to see a known
                  ($All_Elems)     #       element here?
                  .*?
                  (\s?\\?\/?($Tag_End))
                  /giox) {
    my ($el, $end, $m) = ($1, $2);
    my $pos = $this->_pos($line, pos($line) - length($3));
    if ($end =~ m|/>$|o) {
      if ($this->{_xhtml} &&
          $el !~ /^$Compat_Elems$/io &&   # These don't apply here, see later.
          $end !~ m|\s\\?/|o) {
        push(@errors, { col  => $pos,
                        type => 'W',
                        mesg => 'use space before "/>" for compatibility',
                        elem => $el,
                      },
            );
      } elsif (! $this->{_xml} && $end =~ m|/>$|o) {
        push(@errors, { col  => $pos,
                        type => 'E',
                        mesg => 'element end "/>" is allowed in X(HT)ML only',
                        elem => $el,
                      },
            );
      }
    }
  }

  # ---

  # Check for missing " />".
  if ($this->{_xhtml}) {

    while ($line =~ /
                      <
                      ($Min_Elems)
                      .*?
                      (\/?$Tag_End)
                      /giox) {
      my ($el, $end) = ($1, $2);
      if ($end ne '/>') {
        push(@errors, { col  => $this->_pos($line, pos($line) - length($end)),
                        elem => $el,
                        mesg => 'missing " />"',
                        type => 'E',
                      },
            );
      }
    }

    while ($line =~ /
                      <
                      ($Compat_Elems)
                      .*?
                      (\s?.?$Tag_End)
                      /giox) {
      my ($el, $end) = ($1, $2);
      $msg = 'use "<%s></%s>" instead of <%s for compatibility';
      if ($end =~ m|(\s?/>)$|o) {
        my $e = lc($el);
        push(@errors, { col  => $this->_pos($line, pos($line) - length($end)),
                        elem => $el,
                        mesg => sprintf($msg, $e, $e, $e . $1),
                        type => 'W',
                      },
            );
      }
    }
  }

  return @errors;
}

# ----- Check: entities ----------------------------------------------------- #

# Check for unterminated entities in URIs (usually & instead of &amp;).
sub entities { return shift->_wrap('_entities', @_);}

sub _entities
{
  my ($this, $line) = @_;
  return () unless $this->{_html};

  # Optimization. "src=&" is the shortest we know of.
  return () unless $line =~ /\w{3}=./;

  my @errors = ();
  my $msg = 'unterminated entity: %s';

  while ($line =~ /
                  (?:^|\s)
                  ($URI_Attrs)
                  =
                  (
                    (.+?)
                    (?:
                      (?<!\[%) # Protect Template Toolkit's "[% ".
                      \s       # A space terminates here.
                      (?!%\])  # Protect Template Toolkit's " %]".
                      |
                      $Tag_End
                    )
                  )
                  /giox) {

    my ($attr, $pos, $val) = ($1, pos($line) - length($2), $3);

    while ($val =~ /(&([^;]*?))[=\"\'\#\s]/go) {
      push(@errors, { col =>
                      $this->_pos($line, $pos + pos($val) - length($2) - 1),
                      type => 'E',
                      mesg => sprintf($msg, $1),
                      attr => $attr,
                    },
          );
    }
  }

  return @errors;
}

# ----- Check: DOCTYPE ------------------------------------------------------ #

# Check for doctype declaration errors.
sub doctype { return shift->_wrap('_doctype', @_); }

sub _doctype
{
  my ($this, $line) = @_;
  my @errors = ();

  while ($line =~ /<!((DOCTYPE)\s+($Non_Tag_End+)>)/gio) {
    my ($pos, $dt, $rest) = (pos($line) - length($1), $2, $3);
    if ($dt ne "DOCTYPE") {
      push(@errors, { col  => $this->_pos($line, $pos),
                      type => 'E',
                      mesg => "DOCTYPE must be uppercase: $dt",
                    },
          );

      $pos = pos($line) - length($rest) - 1;

      if ($this->{_html} &&
          (my ($p1, $html, $t) = ($rest =~ /^((html)\s+)(\w+)?/io))) {

        # TODO: better message, maybe this should not be XHTML-only.
        if ($this->{_xhtml} && $html ne 'html') {
          my $msg = "\"html\" in DOCTYPE should be lowercase in XHTML: $html";
          push(@errors, { col  => $this->_pos($line, $pos),
                          type => 'W',
                          mesg => $msg,
                        },
              );
        }

        $pos += length($p1);

        if ($t =~ /^(PUBLIC|SYSTEM)$/io) {
          if ($t ne uc($t)) {
            my $msg = uc($t) . " must be uppercase: \"$t\"";
            push(@errors, { col  => $this->_pos($line, $pos),
                            type => 'E',
                            mesg => $msg,
                          },
                );

            if ($this->{_xml} && uc($t) eq 'PUBLIC') {
              # TODO: In XML, you can't declare public ID without
              # system ID.  Check this.
            }
          }
        } else {
          my $msg = "PUBLIC or SYSTEM should follow root element name: \"$t\"";
          push(@errors, { col  => $this->_pos($line, $pos),
                          type => 'W',
                          mesg => $msg,
                        },
              );
        }
      }
    }
  }

  return @errors;
}


# ---------- Accessors and mutators ----------------------------------------- #

sub mode
{
  my ($this, $mode) = @_;
  if ($mode) {
    my $was_xml = $this->{_xml};
    if ($mode eq 'HTML') {
      $this->{_xhtml} = 0;
      $this->{_xml}   = 0;
      $this->{_html}  = 1;
      $this->{_mode}  = $mode;
    } elsif ($mode eq 'XML') {
      $this->{_xhtml} = 0;
      $this->{_xml}   = 1;
      $this->{_html}  = 0;
      $this->{_mode}  = $mode;
      $this->quote('"') unless $was_xml;
    } elsif ($mode eq 'XHTML') {
      $this->{_xhtml} = 1;
      $this->{_xml}   = 1;
      $this->{_html}  = 1;
      $this->{_mode}  = $mode;
      $this->quote('"') unless $was_xml;
    } else {
      carp("** Mode must be one of XHTML, HTML, XML (resetting to XHTML)");
      $this->mode('XHTML');
    }
  }
  return $this->{_mode};
}

sub tab_width
{
  my ($this, $tw) = @_;
  if (defined($tw)) {
    if ($tw > 0) {
      $this->{_tab_width} = sprintf("%.0f", $tw); # Uh. Integers please.
    } else {
      carp("** TAB width must be > 0");
    }
  }
  return $this->{_tab_width};
}

sub min_attributes
{
  my ($this, $minattr) = @_;
  if (defined($minattr)) {
    if (! $minattr && $this->{_xml}) {
      carp("** Will not disable minimized attribute checks in " .
           $this->mode() . " mode");
    } else {
      $this->{_min_attrs} = $minattr;
    }
  }
  return $this->{_min_attrs};
}

sub stats
{
  my $this = shift;
  return ($this->{_num_errors}, $this->{_num_warnings});
}

sub reset
{
  my $this = shift;
  my ($e, $w) = $this->stats();
  $this->{_num_errors} = 0;
  $this->{_num_warnings} = 0;
  return ($e, $w);
}

sub quote
{
  my ($this, $q) = @_;
  if (defined($q)) {
    # We always allow " and ', and empty when non-xml, refuse others.
    my $is_ok = ($q eq '"'       || $q eq "'"   );
    $is_ok  ||= (! $this->{_xml} && ! length($q));
    if ($is_ok) {
      $this->{_quote} = $q;
    } else {
      carp("** Refusing to set quote to ", ($q || '[none]'),
           " when in " . $this->mode() . " mode");
    }
  }
  return $this->{_quote};
}

sub full_version
{
  return "HTML::GMUCK $VERSION";
}

# ---------- Utility methods ------------------------------------------------ #

sub _pos
{
  my ($this, $line, $pos) = @_;
  $pos = 0 unless (defined($pos) && $pos > 0);
  if ($this->{_tab_width} > 1 && $pos > 0) {
    my $pre = substr($line, 0, $pos);
    while ($pre =~ /\t/g) {
      $pos += $this->{_tab_width} - 1;
    }
  }
  return $pos;
}

sub _wrap
{
  my ($this, $method, @lines) = @_;
  my @errors = ();
  my $ln = 0;

  for (my $ln = 0; $ln < scalar(@lines); $ln++) {
    foreach my $err ($this->$method($lines[$ln])) {
      $err->{line}   = $ln;
      if (! $err->{mesg}) {
        $err->{mesg} = "no error message, looks like you found a bug";
        carp("** " . ucfirst($err->{mesg}));
      }
      $err->{col}  ||= 0;
      if (! $err->{type}) {
        carp("** No error type, looks like you found a bug");
        $err->{type} = '?';
      }
      push(@errors, $err);
      if ($err->{type} eq 'W') {
        $this->{_num_warnings}++;
      } else {
        $this->{_num_errors}++;
      }
    }
  }

  return @errors;
}

1;