HTML::Selector::XPath - CSS Selector to XPath compiler


HTML-Selector-XPath documentation Contained in the HTML-Selector-XPath distribution.

Index


Code Index:

NAME

Top

HTML::Selector::XPath - CSS Selector to XPath compiler

SYNOPSIS

Top

  use HTML::Selector::XPath;

  my $selector = HTML::Selector::XPath->new("li#main");
  $selector->to_xpath; # //li[@id='main']

  # functional interface
  use HTML::Selector::XPath 'selector_to_xpath';
  my $xpath = selector_to_xpath('div.foo');

  my $relative = selector_to_xpath('div.foo', root => '/html/body/p' );
  # /html/body/p/div[contains(concat(' ', @class, ' '), ' foo ')]

DESCRIPTION

Top

HTML::Selector::XPath is a utility function to compile full set of CSS2 and partial CSS3 selectors to the equivalent XPath expression.

FUNCTIONS and METHODS

Top

selector_to_xpath
  $xpath = selector_to_xpath($selector);

Shortcut for HTML::Selector->new(shift)->to_xpath(@_). Exported upon request.

new
  $sel = HTML::Selector::XPath->new($selector);

Creates a new object.

to_xpath
  $xpath = $sel->to_xpath;
  $xpath = $sel->to_xpath(root => "."); # ./foo instead of //foo

Returns the translated XPath expression. You can optionally pass root parameter, to specify which root to start the expression. It defaults to /.

CAVEATS

Top

CSS SELECTOR VALIDATION

This module doesn't validate whether the original CSS Selector expression is valid. For example,

  div.123foo

is an invalid CSS selector (class names should not begin with numbers), but this module ignores that and tries to generate an equivalent XPath expression anyway.

COPYRIGHT

Top

AUTHOR

Top

Tatsuhiko Miyagawa <miyagawa@bulknews.net>

Most of the logic is based on Joe Hewitt's getElementsBySelector.js on http://www.joehewitt.com/blog/2006-03-20.php and Andrew Dupont's patch to Prototype.js on http://dev.rubyonrails.org/ticket/5171, but slightly modified using Aristotle Pegaltzis' CSS to XPath translation table per http://plasmasturm.org/log/444/

LICENSE

Top

This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself.

SEE ALSO

Top

http://www.w3.org/TR/REC-CSS2/selector.html http://use.perl.org/~miyagawa/journal/31090


HTML-Selector-XPath documentation Contained in the HTML-Selector-XPath distribution.

package HTML::Selector::XPath;

use strict;
use 5.008_001;
our $VERSION = '0.07';

require Exporter;
our @EXPORT_OK = qw(selector_to_xpath);
*import = \&Exporter::import;

use Carp;

sub selector_to_xpath {
    __PACKAGE__->new(shift)->to_xpath(@_);
}

my $reg = {
    # tag name/id/class
    element => qr/^([#.]?)([a-z0-9\\*_-]*)((\|)([a-z0-9\\*_-]*))?/i,
    # attribute presence
    attr1   => qr/^\[([^\]]*)\]/,
    # attribute value match
    attr2   => qr/^\[\s*([^*~\|=\s:^\$]+)\s*([~\|*^\$]?=)\s*"([^"]+)"\s*\]/i,
    attrN   => qr/^:not\((.*?)\)/i,
    pseudo  => qr/^:([()a-z0-9_-]+)/i,
    # adjacency/direct descendance
    combinator => qr/^(\s*[>+~\s])/i,
    # rule separator
    comma => qr/^\s*,/i,
};


sub new {
    my($class, $exp) = @_;
    bless { expression => $exp }, $class;
}

sub selector {
    my $self = shift;
    $self->{expression} = shift if @_;
    $self->{expression};
}

sub convert_attribute_match {
    my ($left,$op,$right) = @_;
    # negation (e.g. [input!="text"]) isn't implemented in CSS, but include it anyway:
    if ($op eq '!=') {
        "\@$left!='$right";
    } elsif ($op eq '~=') { # substring attribute match
        "contains(concat(' ', \@$left, ' '), ' $right ')";
    } elsif ($op eq '*=') { # real substring attribute match
        "contains($left, '$right')";
    } elsif ($op eq '|=') {
        "\@$left='$right' or starts-with(\@$left, '$right-')";
    } elsif ($op eq '^=') {
        "starts-with(\@$left,'$3')";
    } elsif ($op eq '$=') {
        "ends-with(\@$left,'$3')";
    } else { # exact match
        "\@$left='$3'";
    }
};

sub to_xpath {
    my $self = shift;
    my $rule = $self->{expression} or return;
    my %parms = @_;
    my $root = $parms{root} || '/';

    my @parts = ("$root/");
    my $last_rule = '';
    my @next_parts;

    my $tag;
    my $wrote_tag;
    my $tag_index;
    # Loop through each "unit" of the rule
    while (length $rule && $rule ne $last_rule) {
        $last_rule = $rule;

        $rule =~ s/^\s*|\s*$//g;
        last unless length $rule;

        # Prepend explicit first selector if we have an implicit selector
        # (that is, if we start with a combinator)
        if ($rule =~ /$reg->{combinator}/) {
            $rule = "* $rule";
        };

        # Match elements
        if ($rule =~ s/$reg->{element}//) {
            my ($id_class,$name,$lang) = ($1,$2,$3);

            # to add *[1]/self:: for follow-sibling
            if (@next_parts) {
                push @parts, @next_parts; #, (pop @parts);
                @next_parts = ();
            }

            if ($id_class eq '') {
                $tag = $name || '*';
            } else {
                $tag = '*';
            }
            if (! $wrote_tag++) {
                push @parts, $tag;
                $tag_index = $#parts;
            };

            # XXX Shouldn't the RE allow both, ID and class?
            if ($id_class eq '#') { # ID
                push @parts, "[\@id='$name']";
            } elsif ($id_class eq '.') { # class
                push @parts, "[contains(concat(' ', \@class, ' '), ' $name ')]";
            };
        };

        # Match attribute selectors
        if ($rule =~ s/$reg->{attr2}//) {
            push @parts, "[", convert_attribute_match( $1, $2, $3 ), "]";
        } elsif ($rule =~ s/$reg->{attr1}//) {
            # If we have no tag output yet, write the tag:
            if (! $wrote_tag++) {
                push @parts, '*';
                $tag_index = $#parts;
            };
            push @parts, "[\@$1]";
        }

        # Match negation
        if ($rule =~ s/$reg->{attrN}//) {
            my $sub_rule = $1;
            if ($sub_rule =~ s/$reg->{attr2}//) {
                push @parts, "[not(", convert_attribute_match( $1, $2, $3 ), ")]";
            } elsif ($sub_rule =~ s/$reg->{attr1}//) {
                push @parts, "[not(\@$1)]";
            } else {
                Carp::croak "Can't translate '$sub_rule' inside :not()";
            }
        }

        # Ignore pseudoclasses/pseudoelements
        while ($rule =~ s/$reg->{pseudo}//) {
            if ( $1 eq 'first-child') {
                #$parts[$#parts] = '*[1]/self::' . $parts[$#parts];
                # Replace the start of our current rule with a rule
                # enforcing the current child
                $parts[$tag_index] = '*[1]/self::' . $parts[$tag_index];
            } elsif ( $1 eq 'last-child') {
                push @parts, '[not(following-sibling::*)]';
            } elsif ($1 =~ /^lang\(([\w\-]+)\)$/) {
                push @parts, "[\@xml:lang='$1' or starts-with(\@xml:lang, '$1-')]";
            } elsif ($1 =~ /^nth-child\((\d+)\)$/) {
                push @parts, "[count(preceding-sibling::*) = @{[ $1 - 1 ]}]";
            } elsif ($1 =~ /^first-of-type$/) {
                push @parts, "[1]";
            } elsif ($1 =~ /^nth-of-type\((\d+)\)$/) {
                push @parts, "[$1]";
            } elsif ($1 =~ /^contains\($/) {
                $rule =~ s/^\s*"([^"]*)"\s*\)\s*$//
                    or die "Malformed string in :contains(): '$rule'";
                push @parts, qq{[text()[contains(string(.),"$1")]]};
            } elsif ( $1 eq 'root') {
                # This will give surprising results if you do E > F:root
                $parts[0] = "/";
            } elsif ( $1 eq 'empty') {
                push @parts, "[not(* or text())]";
            } else {
                Carp::croak "Can't translate '$1' pseudo-class";
            }
        }

        # Match combinators (>, + and ~)
        if ($rule =~ s/$reg->{combinator}//) {
            my $match = $1;
            if ($match =~ />/) {
                push @parts, "/";
            } elsif ($match =~ /\+/) {
                push @parts, "/following-sibling::*[1]/self::";
                $tag_index = $#parts;
            } elsif ($match =~ /\~/) {
                push @parts, "/following-sibling::";
            } else {
                push @parts, "//";
            }

            # new context
            undef $tag;
            undef $wrote_tag;
        }

        # Match commas
        if ($rule =~ s/$reg->{comma}//) {
            push @parts, " | ", "$root/"; # ending one rule and beginning another
            undef $tag;
            undef $wrote_tag;
        }
    }
    return join '', @parts;
}

1;
__END__