HTML::HTML5::Microdata::ToRDFa - rewrite HTML5+Microdata into XHTML+RDFa


HTML-HTML5-Microdata-ToRDFa documentation. Contained in the HTML-HTML5-Microdata-ToRDFa distribution.

Index


Code Index:

NAME

Top

HTML::HTML5::Microdata::ToRDFa - rewrite HTML5+Microdata into XHTML+RDFa

SYNOPSIS

Top

 use HTML::HTML5::Microdata::ToRDFa;
 my $rdfa = HTML::HTML5::Microdata::ToRDFa->new($html, $baseuri);
 print $rdfa->get_string;

DESCRIPTION

Top

This module may be used to convert HTML documents marked up with Microdata into XHTML+RDFa (which is more widely implemented by consuming software).

If the input document uses a mixture of Microdata and RDFa, the semantics of the output document may be incorrect.

Constructors

$rdfa = HTML::HTML5::Microdata::ToRDFa->new($html, $baseuri)

$html may be an HTML document (as a string) or an XML::LibXML::Document object.

$baseuri is the base URI for resolving relative URI references. If $html is undefined, then this module will fetch $baseuri to obtain the document to be converted.

Public Methods

$rdfa->get_string

Get the document converted to RDFa as a string. This will be well-formed XML, but not necessarily valid XHTML.

$rdfa->get_dom

Get the document converted to XHTML+RDFa as an XML::LibXML::Document object.

Note that every call to get_string or get_dom runs the whole conversion process again from scratch, so calling these methods repeatedly is wasteful. If you need the result more than once, call the method once and keep the returned value.

BUGS

Top

Please report any bugs to http://rt.cpan.org/.

SEE ALSO

Top

HTML::HTML5::Microdata::Parser, RDF::RDFa::Parser.

http://www.perlrdf.org/.

AUTHOR

Top

Toby Inkster <tobyink@cpan.org>.

COPYRIGHT AND LICENCE

Top


HTML-HTML5-Microdata-ToRDFa documentation Contained in the HTML-HTML5-Microdata-ToRDFa distribution.
package HTML::HTML5::Microdata::ToRDFa;

use 5.008;
use strict;

use Digest::SHA1 qw(sha1_hex);
use HTML::HTML5::Microdata::Parser 0.02;
use HTML::HTML5::Writer;
use RDF::Prefixes;
use URI::Escape;
use XML::LibXML qw(:all);

our $VERSION = '0.030';

# Constructor.
#
#   $html - document to convert; passed unchanged to
#           HTML::HTML5::Microdata::Parser->new
#   $base - base URI; passed unchanged to the same parser constructor
#
# Returns a blessed hash holding the parser, the parser's DOM, a fresh
# RDF::Prefixes registry and a blank-node counter starting at zero.
sub new
{
	my ($class, $html, $base) = @_;
	
	# Collaborators are created in the same order as their slots are used:
	# parser first, then the prefix registry, then the parser's DOM.
	my $microdata = HTML::HTML5::Microdata::Parser->new($html, $base);
	my $prefixes  = RDF::Prefixes->new;
	my $document  = $microdata->dom;
	
	return bless {
		'bnodes' => 0,
		'dom'    => $document,
		'parser' => $microdata,
		'prefix' => $prefixes,
	}, $class;
}

# Serialises the converted document as a string.
#
#   %options - recognised key: no_advert; when true, the comment block
#              listing module versions is left out of the output
#
# Returns whatever HTML::HTML5::Writer->document produces for the DOM
# returned by get_dom (the conversion is re-run on every call).
sub get_string
{
	my ($self, %options) = @_;
	
	# Start from an empty string so that the concatenation below never
	# sees undef when the advertisement is suppressed.
	my $advertisement = '';
	
	unless ($options{no_advert})
	{
		$advertisement = sprintf("\n<!--\n\t%s/%s\n\t%s/%s\n\t%s/%s\n\t%s/%s\n\t%s/%s\n -->\n",
			'HTML::HTML5::Microdata::ToRDFa'  => $HTML::HTML5::Microdata::ToRDFa::VERSION,
			'HTML::HTML5::Microdata::Parser'  => $HTML::HTML5::Microdata::Parser::VERSION,
			'HTML::HTML5::Writer'             => $HTML::HTML5::Writer::VERSION,
			'XML::LibXML'                     => $XML::LibXML::VERSION,
			'RDF::Prefixes'                   => $RDF::Prefixes::VERSION,
			);
	}
	
	# The advertisement comment is smuggled into the output by appending it
	# to the doctype string handed to the writer.
	return HTML::HTML5::Writer
		->new(
			markup   => 'xhtml',
			polyglot => 1,
			doctype  => HTML::HTML5::Writer::DOCTYPE_XHTML_RDFA10 . $advertisement,
			)
		->document($self->get_dom);
}

# Runs the Microdata-to-RDFa conversion and returns the result as a new
# document object. The DOM stored in $self is left untouched: the work is
# done on a copy, so every call repeats the whole conversion.
sub get_dom
{
	my ($self) = @_;
	
	# Copy the stored document by serialising it and parsing the text back
	# in with a fresh XML::LibXML parser.
	my $copy = XML::LibXML->new()->parse_string( $self->{'dom'}->toString );
	
	# Walk the copy from its root. No Microdata item is in scope yet (undef)
	# and the initial RDFa subject is the URI reported by the parser.
	$self->_process_element($copy->documentElement, undef, $self->{'parser'}->uri);
	
	return $copy;
}

# Recursively rewrites the Microdata attributes found on $elem and on its
# descendant elements into RDFa attributes, modifying the elements in place.
#
#   $elem         - element (in the cloned DOM) to convert
#   $subject      - identifier of the Microdata item currently in scope, or
#                   undef when no item is in scope
#   $rdfa_subject - subject handed down alongside it: the document URI at
#                   the root, replaced by an ancestor's @href/@src when one
#                   is seen on an element without @itemprop
#
# The return value is not meaningful.
sub _process_element
{
	my ($self, $elem, $subject, $rdfa_subject) = @_;	
	my ($new_subject, $new_rdfa_subject);
	
	# An element with @itemscope starts a new item: identified by @itemid
	# when given, otherwise by a freshly minted blank node.
	if ($elem->hasAttribute('itemscope'))
	{
		if ($elem->hasAttribute('itemid'))
		{
			$new_subject = $elem->getAttribute('itemid');
		}
		else
		{
			$new_subject = $self->_bnode;
		}
	}

	# Outside of any item the remaining Microdata attributes have nothing
	# to attach to, so they are simply stripped.
	unless (defined $subject || defined $new_subject)
	{
		foreach my $attr (qw(itemprop itemtype itemid itemref))
		{
			$elem->removeAttribute($attr)
				if $elem->hasAttribute($attr);
		}
	}

	# This is complicated and annoying, but it's good to handle @itemref.
	# This technique should work for the vast majority of cases.
	#
	# The Microdata parser is asked to consume the original item; every
	# triple it reports from an element lying outside $elem is materialised
	# as an extra <span> child of $elem carrying the equivalent RDFa.
	if ($elem->hasAttribute('itemref') and $elem->hasAttribute('itemscope'))
	{
		my @new_nodes;
		$self->{'parser'}->set_callbacks({'ontriple'=>sub {
			my $parser  = shift;
			my $node    = shift;
			my $triple  = shift;
			
			# if $node is an element outside of $elem
			#
			# "Inside" means the same path or a path continuing with a
			# further step; a bare prefix test would wrongly treat
			# ".../div[10]" as lying inside ".../div[1]".
			my $elem_path = $elem->nodePath;
			my $node_path = $node->nodePath;
			unless ($node_path eq $elem_path
			or index($node_path, "$elem_path/") == 0)
			{
				my $new = $elem->addNewChild('http://www.w3.org/1999/xhtml', 'span');
				$new->setAttribute('class', 'microdata-to-rdfa--itemref');
				push @new_nodes, $new;
				
				if ($triple->subject->is_blank)
				{
					$new->setAttribute('about' => '_:'.$triple->subject->blank_identifier);
				}
				else
				{
					$new->setAttribute('about' => $triple->subject->uri);
				}
				if ($triple->object->is_literal)
				{
					$new->setAttribute('property' => $self->_super_split($new, $triple->predicate->uri));
					$new->setAttribute('content'  => $triple->object->literal_value);
					$new->setAttribute('datatype' => $self->_super_split($new, $triple->object->literal_datatype))
						if $triple->object->has_datatype;
					$new->setAttribute('xml:lang' => $triple->object->literal_value_language)
						if $triple->object->has_language;
				}
				else
				{
					$new->setAttribute('rel' => $self->_super_split($new, $triple->predicate->uri));
					if ($triple->object->is_blank)
					{
						$new->setAttribute('resource' => '_:'.$triple->object->blank_identifier);
					}
					else
					{
						$new->setAttribute('resource' => $triple->object->uri);
					}
				}
			}
			
			return 1;
			}});
		my $new_uri = $self->{'parser'}->consume_microdata_item( $self->_get_orig_node($elem) );
		
		# consume_microdata_item would have issued a new blank node identifier
		# for the item. Let's write over that.
		#
		# The item consumed is $elem itself, so the identifier to substitute
		# is the one chosen for $elem above ($new_subject), not the subject
		# of the enclosing item (which is undef for a top-level item).
		foreach my $node (@new_nodes)
		{
			$node->setAttribute('about' => $new_subject)
				if $node->getAttribute('about') eq $new_uri;
			$node->setAttribute('resource' => $new_subject)
				if $node->getAttribute('resource') eq $new_uri;
		}
		
		$elem->removeAttribute('itemref');
	}

	$elem->removeAttribute('itemscope')
		if $elem->hasAttribute('itemscope');

	# This copes with <a href="..."><span itemprop="...">...</span></a>
	# and related. The @href shouldn't set a new subject in Microdata.
	$new_rdfa_subject = $elem->getAttribute('href')
		if $elem->hasAttribute('href')
		&& !$elem->hasAttribute('itemprop');
	$new_rdfa_subject = $elem->getAttribute('src')
		if $elem->hasAttribute('src')
		&& !$elem->hasAttribute('itemprop');

	# Case 1: a new item that is not itself a property value.
	# It becomes @about, with @itemtype mapped onto @typeof.
	if (defined $new_subject && !$elem->hasAttribute('itemprop'))
	{
		$elem->setAttribute('about' => $new_subject);
		$elem->removeAttribute('itemid')
			if $elem->hasAttribute('itemid');
		
		if ($elem->hasAttribute('itemtype'))
		{
			my ($expand, $prefix, $suffix) = $self->_split($elem->getAttribute('itemtype'));
			$elem->setAttribute('typeof' => "$prefix:$suffix");
			$elem->setAttribute("xmlns:$prefix" => $expand);
			$elem->removeAttribute('itemtype');
		}
	}

	# Case 2: a new item that is the value of a property of the enclosing
	# item. It becomes @rel + @resource.
	elsif (defined $new_subject && $elem->hasAttribute('itemprop'))
	{
		$elem->setAttribute('resource' => $new_subject);
		$elem->removeAttribute('itemid')
			if $elem->hasAttribute('itemid');
		
		$elem->setAttribute(
			'rel' => $self->_super_split($elem, $elem->getAttribute('itemprop'))
			);
		$elem->removeAttribute('itemprop');
		
		# @typeof is not used on this element; the item's type is stated
		# by an extra child <span rel="rdf:type" resource="[type]">.
		if ($elem->hasAttribute('itemtype'))
		{
			my $new = $elem->addNewChild('http://www.w3.org/1999/xhtml', 'span');
			$new->setAttribute('class', 'microdata-to-rdfa--rdftype');
			
			my ($expand, $prefix, $suffix) = $self->_split($elem->getAttribute('itemtype'));			
			$new->setAttribute('resource' => "[$prefix:$suffix]");
			$new->setAttribute("xmlns:$prefix" => $expand);

			# The predicate goes in @rel; writing it to @resource again
			# would overwrite the type just stored there.
			($expand, $prefix, $suffix) = $self->_split('http://www.w3.org/1999/02/22-rdf-syntax-ns#type');
			$new->setAttribute('rel' => "$prefix:$suffix");
			$new->setAttribute("xmlns:$prefix" => $expand);
		}		
	}

	# Case 3: a plain property. Which attribute carries the value depends
	# on the element name.
	elsif ($elem->hasAttribute('itemprop'))
	{
		# Elements whose value is taken from @src.
		if ($elem->localname =~ /^(audio | embed | iframe | img | source | video)$/ix)
		{
			if ($elem->hasAttribute('src'))
			{
				$elem->setAttribute(
					'rel' => $self->_super_split($elem, $elem->getAttribute('itemprop'))
					);
				$elem->removeAttribute('itemprop');
				
				$elem->setAttribute('about' => $subject);
				$elem->setAttribute('resource' => $elem->getAttribute('src'));
			}
			else
			{
				$elem->setAttribute(
					'property' => $self->_super_split($elem, $elem->getAttribute('itemprop'))
					);
				$elem->removeAttribute('itemprop');
				
				$elem->setAttribute('about' => $subject);
				$elem->setAttribute('content' => '');
			}
		}
		# Elements whose value is taken from @href.
		elsif ($elem->localname =~ /^(a | area | link)$/ix)
		{
			if ($elem->hasAttribute('href'))
			{
				$elem->setAttribute(
					'rel' => $self->_super_split($elem, $elem->getAttribute('itemprop'))
					);
				$elem->removeAttribute('itemprop');
			}
			else
			{
				$elem->setAttribute(
					'property' => $self->_super_split($elem, $elem->getAttribute('itemprop'))
					);
				$elem->removeAttribute('itemprop');
				
				$elem->setAttribute('content' => '');
			}
		}
		# <object> takes its value from @data.
		elsif ($elem->localname =~ /^(object)$/ix)
		{
			if ($elem->hasAttribute('data'))
			{
				$elem->setAttribute(
					'rel' => $self->_super_split($elem, $elem->getAttribute('itemprop'))
					);
				$elem->removeAttribute('itemprop');
				$elem->setAttribute('resource' => $elem->getAttribute('data'));
			}
			else
			{
				$elem->setAttribute(
					'property' => $self->_super_split($elem, $elem->getAttribute('itemprop'))
					);
				$elem->removeAttribute('itemprop');
				$elem->setAttribute('content' => '');
			}
		}
		# Everything else: the element content is the value. An empty
		# @datatype is added when the element has child elements.
		else
		{
			$elem->setAttribute(
				'property' => $self->_super_split($elem, $elem->getAttribute('itemprop'))
				);
			$elem->removeAttribute('itemprop');
			$elem->setAttribute('datatype' => '')
				if $elem->getChildrenByTagName('*');
		}
	}

	# When the Microdata subject and the inherited RDFa subject differ,
	# pin the statement to the Microdata subject with an explicit @about.
	if ($subject ne $rdfa_subject
	and ($elem->hasAttribute('rel') || $elem->hasAttribute('property'))
	and !$elem->hasAttribute('about'))
	{
		$elem->setAttribute('about' => $subject);
	}
	
	# Recurse into child elements (including any <span>s added above),
	# handing down the new subjects where this element introduced them.
	foreach my $kid ($elem->getChildrenByTagName('*'))
	{
		$self->_process_element($kid, $new_subject||$subject, $new_rdfa_subject||$rdfa_subject);
	}
}

# Turns a full URI into CURIE parts using the object's prefix registry.
#
#   $uri - URI to abbreviate
#
# Returns a three-element list: (namespace URI bound to the prefix,
# prefix, local part after the first colon).
sub _split
{
	my ($self, $uri) = @_;
	
	my $registry = $self->{prefix};
	
	# get_curie is called before to_hashref is consulted, as in the
	# lookup below the prefix must already be known to the registry.
	my $qname = $registry->get_curie($uri);
	my ($prefix, $suffix) = split /:/, $qname, 2;
	my $namespace = $registry->to_hashref->{$prefix};
	
	return ($namespace, $prefix, $suffix);
}

# Converts a whitespace-separated list of property names into a
# space-separated list of CURIEs, declaring each prefix used as an xmlns:*
# attribute on $elem.
#
#   $elem - element that receives the namespace declarations; its
#           counterpart in the original DOM is used to look up the
#           governing item type
#   $str  - whitespace-separated tokens
#
# Tokens containing a colon are treated as full URIs. Other non-empty
# tokens are only converted when an item type is found, in which case a URI
# under http://www.w3.org/1999/xhtml/microdata# is built from the type and
# the token. Anything else is dropped from the result.
sub _super_split
{
	my ($self, $elem, $str) = @_;
	
	my $type = $self->_get_node_type( $self->_get_orig_node($elem) );
	
	my @curies;
	
	foreach my $token (split /\s+/, $str)
	{
		my $full_uri;
		
		if ($token =~ /:/)
		{
			$full_uri = $token;
		}
		elsif (defined $type and length $token)
		{
			# type, then '#' if the type has none, then ':', then the
			# escaped token; the whole lot is escaped once more.
			my $scope = $type;
			$scope .= '#' unless $scope =~ /#/;
			$scope .= ':';
			$full_uri = "http://www.w3.org/1999/xhtml/microdata#" . uri_escape($scope . uri_escape($token));
		}
		else
		{
			next;
		}
		
		my ($expand, $prefix, $suffix) = $self->_split($full_uri);
		$elem->setAttribute("xmlns:$prefix" => $expand);
		push @curies, "$prefix:$suffix";
	}
	
	return join ' ', @curies;
}

# Finds the node in the stored (unconverted) DOM that sits at the same
# path as $node, by evaluating $node's nodePath against the stored
# document's root element.
#
# Returns the first match, or undef when nothing matches.
sub _get_orig_node
{
	my ($self, $node) = @_;
	
	my $root = $self->{'dom'}->documentElement;
	
	# List assignment keeps findnodes in list context and picks the
	# first hit.
	my ($first) = $root->findnodes( $node->nodePath );
	
	return $first;
}

# Looks up the @itemtype that governs $node: the node's own @itemtype if it
# has one, otherwise that of the nearest ancestor element carrying one.
# The climb stops at the stored document's root element.
#
# Returns the attribute value, or undef when $node is false, is not an
# element, or no @itemtype is found on the way up.
sub _get_node_type
{
	my ($self, $node) = @_;
	
	unless ($node and $node->nodeType == XML_ELEMENT_NODE)
	{
		return undef;
	}
	
	if ($node->hasAttribute('itemtype'))
	{
		return $node->getAttribute('itemtype');
	}
	
	# Only continue upwards when this is not the root element and the
	# parent exists and is itself an element.
	my $may_climb = ($node != $self->{'dom'}->documentElement
		and defined $node->parentNode
		and $node->parentNode->nodeType == XML_ELEMENT_NODE);
	
	return $may_climb
		? $self->_get_node_type($node->parentNode)
		: undef;
}

# Mints a new blank node identifier of the form _:HTMLAutoNodeNNN, where NNN
# is the object's running counter zero-padded to at least three digits.
# The counter is advanced by one on every call.
sub _bnode
{
	my ($self) = @_;
	
	my $serial = $self->{bnodes}++;
	
	return sprintf('_:HTMLAutoNode%03d', $serial);
}

1;

__DATA__
dcterms	http://purl.org/dc/terms/
eg	http://example.com/
foaf	http://xmlns.com/foaf/0.1/
md	http://www.w3.org/1999/xhtml/microdata#
owl	http://www.w3.org/2002/07/owl#
rdf	http://www.w3.org/1999/02/22-rdf-syntax-ns#
rdfs	http://www.w3.org/2000/01/rdf-schema#
rss	http://purl.org/rss/1.0/
sioc	http://rdfs.org/sioc/ns#
skos	http://www.w3.org/2004/02/skos/core#
xhv	http://www.w3.org/1999/xhtml/vocab#
xsd	http://www.w3.org/2001/XMLSchema#
__END__