HTML::DOMbo - convert between XML::DOM and {XML/HTML}::Element trees


HTML-DOMbo documentation Contained in the HTML-DOMbo distribution.

Index


Code Index:

NAME

Top

HTML::DOMbo -- convert between XML::DOM and {XML/HTML}::Element trees

SYNOPSIS

Top

  use HTML::DOMbo;
  use HTML::TreeBuilder;
  my $tree = HTML::TreeBuilder->new;
  $tree->parse_from_file('foo.html');
  my $dom_tree = $tree->to_XML_DOM;
  # Now you have a DOM element in $dom_tree!

DESCRIPTION

Top

This class puts a method into HTML::Element called to_XML_DOM, and puts into the class XML::DOM::Node two methods, to_HTML_Element and to_XML_Element.

to_XML_DOM

The class HTML::TreeBuilder robustly produces parse trees of HTML, but the trees are made of HTML::Element objects, not W3C DOM objects. If you want to access a TreeBuilder-made parse tree (in $tree) with a DOM interface, use HTML::DOMbo and then call

  my $dom_tree = $tree->to_XML_DOM;

This returns a new object of the appropriate class (presumably XML::DOM::Element), in a new DOM document, having the same structure and content as the given HTML::TreeBuilder/Element tree. If you want the elements to be instantiated against an existing document object, instead call:

  my $dom_tree = $tree->to_XML_DOM($existing_dom_document);

to_HTML_Element and to_XML_Element

This module provides two experimental methods (in the XML::DOM::Node class) called to_HTML_Element and to_XML_Element, which clone a DOM node (or DOM document, or document fragment) as a new HTML::Element or XML::Element object. You need to have the XML::Element module (from the XML::TreeBuilder dist) installed in order to use the to_XML_Element method.

It is possible for this to throw a fatal exception. And it it possible for this to return a text string instead (if the DOM node given was a text node). Moreover, in list context it may return any number of items, if the source object is a document fragment containing more than one top-level node, or no nodes.

Users are encouraged to report to me any problems (or successes) in using this method. The behavior of this method may change in response to your requests.

SEE ALSO

Top

XML::DOM, HTML::TreeBuilder, HTML::Element, XML::Element.

COPYRIGHT

Top

AUTHOR

Top

Sean M. Burke, <sburke@cpan.org>


HTML-DOMbo documentation Contained in the HTML-DOMbo distribution.

require 5;
# Time-stamp: "2005-01-04 21:16:40 AST"
package HTML::DOMbo;
use strict;
use vars qw($VERSION);
$VERSION = '3.10';

use HTML::Element (); # just for sanity's sake
use XML::DOM; # import all the nice constants
use Carp ();

BEGIN { eval('sub DEBUG () {0}') unless defined &DEBUG; }

#---------------------------------------------------------------------------

# Types of things to handle:
# UNKNOWN_NODE (0)                The node type is unknown (not part of DOM)
#
# ELEMENT_NODE (1)                The node is an Element.
# ATTRIBUTE_NODE (2)              The node is an Attr.
# TEXT_NODE (3)                   The node is a Text node.
# CDATA_SECTION_NODE (4)          The node is a CDATASection.
# ENTITY_REFERENCE_NODE (5)       The node is an EntityReference.
# ENTITY_NODE (6)                 The node is an Entity.
# PROCESSING_INSTRUCTION_NODE (7) The node is a ProcessingInstruction.
# COMMENT_NODE (8)                The node is a Comment.
# DOCUMENT_NODE (9)               The node is a Document.
# DOCUMENT_TYPE_NODE (10)         The node is a DocumentType.
# DOCUMENT_FRAGMENT_NODE (11)     The node is a DocumentFragment.
# NOTATION_NODE (12)              The node is a Notation.
#
# ELEMENT_DECL_NODE (13)          The node is an ElementDecl (not part of DOM)
# ATT_DEF_NODE (14)               The node is an AttDef (not part of DOM)
# XML_DECL_NODE (15)              The node is an XMLDecl (not part of DOM)
#  ATTLIST_DECL_NODE (16)          The node is an AttlistDecl (not part of DOM)

sub XML::DOM::Node::to_XML_Element {
  require XML::Element;
  if(@_ < 2) {
    $_[0]->to_HTML_Element('XML::Element');
  } else {
    shift->to_HTML_Element(@_); # just an alias, then
  }
}

sub XML::DOM::Node::to_HTML_Element { # recursive method
  my $in = $_[0];
  my $element_class = ref($_[1] || '') || $_[1] || 'HTML::Element';

  #print "Input object $in\n";

  Carp::croak "What DOM node?" unless ref $in;
  Carp::croak "$in isn't a DOM node" unless $in->can('getNodeType');
  
  my $type = $in->getNodeType;
  
  if($type == DOCUMENT_FRAGMENT_NODE) {
    my(@c) = $type->getChildNodes;
    if(wantarray) {
      if(@c == 0) {
        return();
      } elsif(@c > 1) {
        return map $_->to_HTML_Element($element_class), @c;
      }
       # else fall thru
    } else {
      if(@c == 0) {
        return undef; # empty fragment!
      } elsif(@c == 1) {
        $in = $c[0];
        $type = $in->getNodeType; #update
      }
       # else fall thru
    }
  }
  
  if($type == DOCUMENT_NODE) {
    $in = $in->getDocumentElement()
     || Carp::croak "Document has no DocumentElement?"; # sanity
    $type = $in->getNodeType; #update
  }
  
  my $out;
  if($type == ELEMENT_NODE) {
    # What did we ever do to deserve such a bungled mess as the DOM?
    # The whole DOM looks like it was drafted in crayon-scrabbled Java
    #  pseudocode after a night spent huffing glue while listening to
    #  Def Leppard.
    #
    # Just look at this steaming mess of code it takes to get all an object's
    #  attributes!
    my(@attrs);
    my $attr_map = $in->getAttributes;
    my $i;
    my $this_attr;
    if($attr_map and $i = $attr_map->getLength) {
      for(my $j = 0; $j < $i; ++$j) {
        $this_attr = $attr_map->item($j);
        #print "    <",$this_attr->getName,'><', $this_attr->getValue, ">\n";
        push @attrs, $this_attr->getName, $this_attr->getValue;
      }
    }
    $out = $element_class->new($in->getTagName(), @attrs);
    
  } elsif($type == TEXT_NODE
       or $type == CDATA_SECTION_NODE
  ) {
    $out = $in->getNodeValue; # yes, just text!

  } elsif($type == ENTITY_REFERENCE_NODE) {
    $out = $in->getData;      # yes, just text!
    $out = '' unless defined $out; # sanity
  } elsif($type == COMMENT_NODE) {
    $out = $element_class->new('~comment', 'text', $in->getData);
    
  } elsif($type == PROCESSING_INSTRUCTION_NODE) {
    $out = $element_class->new('~pi',
                            'text', join(' ', $in->getTarget, $in->getData)
                           );
    
  } elsif($type == DOCUMENT_FRAGMENT_NODE) {
    # a fake-o div.
    $out = $element_class->new('div', '_implicit' => 1);
    
  } else {
    #Carp::croak "I don't know how to handle objects like $in ($type)";
    print "I don't know how to handle objects like $in ($type)" if $^W;
    return;
  }
  #TODO: Declarations?
  
  # Now attach children
  foreach my $c ($in->getChildNodes) {
    die "Trying to put children on a CDATA, Text, or EntityReference node!"
      unless ref $out;
     # Sanity.  But could entity references be children of Text?
    $out->push_content( $c->to_HTML_Element ); # RECURSE!
  }
  
  return $out;
}

#---------------------------------------------------------------------------

sub HTML::Element::to_XML_DOM { # recursive method
  my($in, $doc) = @_;
  Carp::croak "What element?" unless $in and ref $in;
  $doc ||= XML::DOM::Document->new();

  my $out;
  my $tag = $in->tag;
  # Make a DOM clone of this node:
  {
    # Consider the different kinds of HTML::Element objects,
    #  which are distinguished not by their class, but by their
    #  "tag" (GI) attribute:

    DEBUG && print "+ $tag\n";
    die "No tag for $in?" unless defined $tag and length $tag;
     # enforce minimal sanity
    my($k,$v); # scratch

    if($tag eq '~literal') {
      Carp::croak "Can't put a ~literal into a DOM tree";
      # No, it's not the same as a CDATA.  ~literals are a hack.
      
    } elsif($tag eq '~declaration') {
      # Might as well ignore?

    } elsif($tag eq '~pi') {
      $k = $in->attr('text');
      if($k =~ m<^\s*(\S+)\s+(.*)$>s) {
        $out = $doc->createProcessingInstruction($1,$2);
      } elsif($k =~ m<^\s*(\S+)>s) { # minimal sanity?
        $out = $doc->createProcessingInstruction($1,'');
      } else {
        return; # give up
      }

    } elsif($tag eq '~comment') {
      $k = $in->attr('text');
      $k = join(' ', @$k) if ref($k) eq 'ARRAY';  # never used?
      $out = $doc->createComment($k);

    } else {
      # It's a normal element!

      $out = $doc->createElement($tag);
       # An exception will be thrown there if $tag isn't a legal
       #  XML element name.
      my @attrs = $in->all_external_attr();
      while(@attrs) {
        ($k,$v) = splice @attrs,0,2;
        next if $k eq '/'; # hack.
        DEBUG && print "    attr <$k><$v>\n";
        $out->setAttribute($k,$v);
         # An exception will be thrown there if $k isn't a legal
         #  attribute name.
      }
    }
  }

  # Now, recursively, make and attach children.
  {
    my $new_c; #scratch
    foreach my $c ($in->content_list) {
      if(ref($c)) {
        $new_c = $c->to_XML_DOM($doc) || next;
      } else {
        $new_c = $doc->createTextNode($c);
      }
      $out->appendChild($new_c); # and attach
    }
    # Could conceivably throw an exception if you've done
    #  something bone stupid like put a child under a
    #  comment node.
  }
  
  DEBUG && print "- $tag\n";
  return $out;
}

#---------------------------------------------------------------------------

1;

__END__