| HTML-Selector-XPath documentation | Contained in the HTML-Selector-XPath distribution. |
HTML::Selector::XPath - CSS Selector to XPath compiler
use HTML::Selector::XPath;
my $selector = HTML::Selector::XPath->new("li#main");
$selector->to_xpath; # //li[@id='main']
# functional interface
use HTML::Selector::XPath 'selector_to_xpath';
my $xpath = selector_to_xpath('div.foo');
my $relative = selector_to_xpath('div.foo', root => '/html/body/p' );
# /html/body/p/div[contains(concat(' ', @class, ' '), ' foo ')]
HTML::Selector::XPath is a utility function to compile full set of CSS2 and partial CSS3 selectors to the equivalent XPath expression.
$xpath = selector_to_xpath($selector);
Shortcut for HTML::Selector->new(shift)->to_xpath(@_). Exported upon request.
$sel = HTML::Selector::XPath->new($selector);
Creates a new object.
$xpath = $sel->to_xpath; $xpath = $sel->to_xpath(root => "."); # ./foo instead of //foo
Returns the translated XPath expression. You can optionally pass
root parameter, to specify which root to start the expression. It
defaults to /.
This module doesn't validate whether the original CSS Selector expression is valid. For example,
div.123foo
is an invalid CSS selector (class names should not begin with numbers), but this module ignores that and tries to generate an equivalent XPath expression anyway.
Tatsuhiko Miyagawa 2006-2011
Max Maischein 2011-
Tatsuhiko Miyagawa <miyagawa@bulknews.net>
Most of the logic is based on Joe Hewitt's getElementsBySelector.js on http://www.joehewitt.com/blog/2006-03-20.php and Andrew Dupont's patch to Prototype.js on http://dev.rubyonrails.org/ticket/5171, but slightly modified using Aristotle Pegaltzis' CSS to XPath translation table per http://plasmasturm.org/log/444/
This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself.
| HTML-Selector-XPath documentation | Contained in the HTML-Selector-XPath distribution. |
package HTML::Selector::XPath; use strict; use 5.008_001; our $VERSION = '0.07'; require Exporter; our @EXPORT_OK = qw(selector_to_xpath); *import = \&Exporter::import; use Carp; sub selector_to_xpath { __PACKAGE__->new(shift)->to_xpath(@_); } my $reg = { # tag name/id/class element => qr/^([#.]?)([a-z0-9\\*_-]*)((\|)([a-z0-9\\*_-]*))?/i, # attribute presence attr1 => qr/^\[([^\]]*)\]/, # attribute value match attr2 => qr/^\[\s*([^*~\|=\s:^\$]+)\s*([~\|*^\$]?=)\s*"([^"]+)"\s*\]/i, attrN => qr/^:not\((.*?)\)/i, pseudo => qr/^:([()a-z0-9_-]+)/i, # adjacency/direct descendance combinator => qr/^(\s*[>+~\s])/i, # rule separator comma => qr/^\s*,/i, }; sub new { my($class, $exp) = @_; bless { expression => $exp }, $class; } sub selector { my $self = shift; $self->{expression} = shift if @_; $self->{expression}; } sub convert_attribute_match { my ($left,$op,$right) = @_; # negation (e.g. [input!="text"]) isn't implemented in CSS, but include it anyway: if ($op eq '!=') { "\@$left!='$right"; } elsif ($op eq '~=') { # substring attribute match "contains(concat(' ', \@$left, ' '), ' $right ')"; } elsif ($op eq '*=') { # real substring attribute match "contains($left, '$right')"; } elsif ($op eq '|=') { "\@$left='$right' or starts-with(\@$left, '$right-')"; } elsif ($op eq '^=') { "starts-with(\@$left,'$3')"; } elsif ($op eq '$=') { "ends-with(\@$left,'$3')"; } else { # exact match "\@$left='$3'"; } }; sub to_xpath { my $self = shift; my $rule = $self->{expression} or return; my %parms = @_; my $root = $parms{root} || '/'; my @parts = ("$root/"); my $last_rule = ''; my @next_parts; my $tag; my $wrote_tag; my $tag_index; # Loop through each "unit" of the rule while (length $rule && $rule ne $last_rule) { $last_rule = $rule; $rule =~ s/^\s*|\s*$//g; last unless length $rule; # Prepend explicit first selector if we have an implicit selector # (that is, if we start with a combinator) if ($rule =~ /$reg->{combinator}/) { $rule = "* $rule"; }; # Match elements if ($rule =~ s/$reg->{element}//) { my ($id_class,$name,$lang) = ($1,$2,$3); # to add *[1]/self:: for follow-sibling if (@next_parts) { push @parts, @next_parts; #, (pop @parts); @next_parts = (); } if ($id_class eq '') { $tag = $name || '*'; } else { $tag = '*'; } if (! $wrote_tag++) { push @parts, $tag; $tag_index = $#parts; }; # XXX Shouldn't the RE allow both, ID and class? if ($id_class eq '#') { # ID push @parts, "[\@id='$name']"; } elsif ($id_class eq '.') { # class push @parts, "[contains(concat(' ', \@class, ' '), ' $name ')]"; }; }; # Match attribute selectors if ($rule =~ s/$reg->{attr2}//) { push @parts, "[", convert_attribute_match( $1, $2, $3 ), "]"; } elsif ($rule =~ s/$reg->{attr1}//) { # If we have no tag output yet, write the tag: if (! $wrote_tag++) { push @parts, '*'; $tag_index = $#parts; }; push @parts, "[\@$1]"; } # Match negation if ($rule =~ s/$reg->{attrN}//) { my $sub_rule = $1; if ($sub_rule =~ s/$reg->{attr2}//) { push @parts, "[not(", convert_attribute_match( $1, $2, $3 ), ")]"; } elsif ($sub_rule =~ s/$reg->{attr1}//) { push @parts, "[not(\@$1)]"; } else { Carp::croak "Can't translate '$sub_rule' inside :not()"; } } # Ignore pseudoclasses/pseudoelements while ($rule =~ s/$reg->{pseudo}//) { if ( $1 eq 'first-child') { #$parts[$#parts] = '*[1]/self::' . $parts[$#parts]; # Replace the start of our current rule with a rule # enforcing the current child $parts[$tag_index] = '*[1]/self::' . $parts[$tag_index]; } elsif ( $1 eq 'last-child') { push @parts, '[not(following-sibling::*)]'; } elsif ($1 =~ /^lang\(([\w\-]+)\)$/) { push @parts, "[\@xml:lang='$1' or starts-with(\@xml:lang, '$1-')]"; } elsif ($1 =~ /^nth-child\((\d+)\)$/) { push @parts, "[count(preceding-sibling::*) = @{[ $1 - 1 ]}]"; } elsif ($1 =~ /^first-of-type$/) { push @parts, "[1]"; } elsif ($1 =~ /^nth-of-type\((\d+)\)$/) { push @parts, "[$1]"; } elsif ($1 =~ /^contains\($/) { $rule =~ s/^\s*"([^"]*)"\s*\)\s*$// or die "Malformed string in :contains(): '$rule'"; push @parts, qq{[text()[contains(string(.),"$1")]]}; } elsif ( $1 eq 'root') { # This will give surprising results if you do E > F:root $parts[0] = "/"; } elsif ( $1 eq 'empty') { push @parts, "[not(* or text())]"; } else { Carp::croak "Can't translate '$1' pseudo-class"; } } # Match combinators (>, + and ~) if ($rule =~ s/$reg->{combinator}//) { my $match = $1; if ($match =~ />/) { push @parts, "/"; } elsif ($match =~ /\+/) { push @parts, "/following-sibling::*[1]/self::"; $tag_index = $#parts; } elsif ($match =~ /\~/) { push @parts, "/following-sibling::"; } else { push @parts, "//"; } # new context undef $tag; undef $wrote_tag; } # Match commas if ($rule =~ s/$reg->{comma}//) { push @parts, " | ", "$root/"; # ending one rule and beginning another undef $tag; undef $wrote_tag; } } return join '', @parts; } 1; __END__