PPIx::Regexp::Token::Structure - Represent structural elements.


PPIx-Regexp documentation Contained in the PPIx-Regexp distribution.

Index


Code Index:

NAME

Top

PPIx::Regexp::Token::Structure - Represent structural elements.

SYNOPSIS

Top

 use PPIx::Regexp::Dumper;
 PPIx::Regexp::Dumper->new( 'qr{(foo)}smx' )
     ->print();

INHERITANCE

Top

PPIx::Regexp::Token::Structure is a PPIx::Regexp::Token.

PPIx::Regexp::Token::Structure is the parent of PPIx::Regexp::Token::Delimiter.

DESCRIPTION

Top

This class represents things that define the structure of the regular expression. This typically means brackets of various sorts but, to prevent proliferation of token classes, the type of the regular expression is also stored here.

METHODS

Top

This class provides no public methods beyond those provided by its superclass.

SUPPORT

Top

Support is by the author. Please file bug reports at http://rt.cpan.org, or send them by electronic mail to the author.

AUTHOR

Top

Thomas R. Wyant, III wyant at cpan dot org

COPYRIGHT AND LICENSE

Top


PPIx-Regexp documentation Contained in the PPIx-Regexp distribution.
package PPIx::Regexp::Token::Structure;

use strict;
use warnings;

use base qw{ PPIx::Regexp::Token };

use PPIx::Regexp::Constant qw{
    COOKIE_CLASS
    COOKIE_QUANT
    MINIMUM_PERL
    TOKEN_LITERAL
};

# Tokens we are responsible for making, under at least some
# circumstances.
use PPIx::Regexp::Token::Comment	();
use PPIx::Regexp::Token::Modifier	();
use PPIx::Regexp::Token::Backreference	();
use PPIx::Regexp::Token::Backtrack	();
use PPIx::Regexp::Token::Recursion	();

our $VERSION = '0.020';

# Report whether a quantifier may follow this token. Only the closing
# parenthesis and the closing square bracket qualify. When called on
# something that is not a reference (e.g. a class name), nothing is
# returned.

my %quantifiable = (
    ')'	=> 1,
    ']'	=> 1,
);

sub can_be_quantified {
    my $self = shift;
    return unless ref $self;
    my $content = $self->content();
    return $quantifiable{$content};
}

# Return the value of the object's is_quantifier attribute. When called
# on something that is not a reference (e.g. a class name), nothing is
# returned.
sub is_quantifier {
    my $self = shift;
    return unless ref $self;
    return $self->{is_quantifier};
}

{

    # This class does double duty: it represents both the leading token
    # that gives the type of the regular expression and the brackets
    # found inside the expression. No attempt is made to separate the
    # two roles. Instead we rely on the (currently true) assumption that
    # 'qr' never satisfies the ::Token::Structure recognition logic, so
    # an object of this class can only have that content if it was made
    # by the brute-force construction of the initial token.

    my %introduced_in = (
	'qr'	=> '5.005',
    );

    # Return the version of Perl that introduced this token: the entry
    # in the table above for the token's content if there is one, or
    # MINIMUM_PERL otherwise.
    sub perl_version_introduced {
	my $self = shift;
	my $version = $introduced_in{ $self->content() };
	return $version || MINIMUM_PERL;
    }
}

{

    # The characters this class is prepared to handle. Anything else is
    # declined.
    my %delim = map { $_ => 1 } qw/ ( ) { } [ ] /;

    # Regular expressions to match various parenthesized tokens, and the
    # classes to make them into. Each element is an array reference
    # holding a class name followed by whatever that class'
    # __PPIX_TOKEN__recognize() returned; the latter are consumed below
    # as [ $regexp, $arg ] array references.

    my @paren_token = map {
	[ $_ => $_->__PPIX_TOKEN__recognize() ]
    }
	'PPIx::Regexp::Token::Comment',
	'PPIx::Regexp::Token::Modifier',
	'PPIx::Regexp::Token::Backreference',
	'PPIx::Regexp::Token::Backtrack',
	'PPIx::Regexp::Token::Recursion',
    ;

    # Tokenizer hook, called as
    #   $class->__PPIX_TOKENIZER__regexp( $tokenizer, $character )
    # where $tokenizer is the tokenizer object and $character is the
    # character under consideration.
    #
    # Returns nothing if $character is not one of the delimiters in
    # %delim. Otherwise it returns 1, a token made by the tokenizer, or
    # a list of such tokens, depending on the delimiter and on the
    # cookies currently set in the tokenizer.
    # NOTE(review): the bare 1 presumably asks the tokenizer to accept
    # one character as a token of this class -- confirm against the
    # tokenizer.

    sub __PPIX_TOKENIZER__regexp {
	my ( $class, $tokenizer, $character ) = @_;

	# We are not interested in anything but delimiters.
	$delim{$character} or return;

	# Inside a character class, all the delimiters are normal characters
	# except for the close square bracket.
	if ( $tokenizer->cookie( COOKIE_CLASS ) ) {
	    $character eq ']'
		or return $tokenizer->make_token( 1, TOKEN_LITERAL );
	}

	# Open parentheses have various interesting possibilities ...
	if ( $character eq '(' ) {

	    # Sometimes the whole bunch of parenthesized characters seems
	    # naturally to be a token. The first regular expression that
	    # matches determines the class of the token made.
	    foreach my $entry ( @paren_token ) {
		my ( $token_class, @recognize ) = @{ $entry };
		foreach my $spec ( @recognize ) {
		    my ( $regexp, $arg ) = @{ $spec };
		    my $accept = $tokenizer->find_regexp( $regexp ) or next;
		    return $tokenizer->make_token(
			$accept, $token_class, $arg );
		}
	    }

	    # We expect certain tokens only after a left paren.
	    $tokenizer->expect(
		'PPIx::Regexp::Token::GroupType::Modifier',
		'PPIx::Regexp::Token::GroupType::NamedCapture',
		'PPIx::Regexp::Token::GroupType::Assertion',
		'PPIx::Regexp::Token::GroupType::Code',
		'PPIx::Regexp::Token::GroupType::BranchReset',
		'PPIx::Regexp::Token::GroupType::Subexpression',
		'PPIx::Regexp::Token::GroupType::Switch',
	    );

	    # Modifier changes are local to this parenthesis group
	    $tokenizer->modifier_duplicate();

	    # Accept the parenthesis.
	    return 1;
	}

	# Close parentheses end modifier localization
	if ( $character eq ')' ) {
	    $tokenizer->modifier_pop();
	    return 1;
	}

	# Open curlys are complicated because they may or may not represent
	# the beginning of a quantifier, depending on what comes before the
	# close curly. So we set a cookie to monitor the token stream for
	# interlopers. If all goes well, the right curly will find the
	# cookie and know it is supposed to be a quantifier.
	if ( $character eq '{' ) {

	    # If the prior token can not be quantified, all this is
	    # unnecessary.
	    $tokenizer->prior( 'can_be_quantified' )
		or return 1;

	    # We make our token now, before setting the cookie. Otherwise
	    # the cookie has to deal with this token.
	    my $token = $tokenizer->make_token( 1 );

	    # A cookie for the next '}'. The cookie receives its own
	    # tokenizer and token arguments (which shadow the outer
	    # variables of the same names); the only outer variable it
	    # uses is $commas.
	    # NOTE(review): a false return presumably makes the tokenizer
	    # discard the cookie, so that the '}' does not find it --
	    # confirm against the tokenizer.
	    my $commas = 0;
	    $tokenizer->cookie( COOKIE_QUANT, sub {
		    my ( $tokenizer, $token ) = @_;
		    $token or return 1;

		    # Of literals, we accept exactly one comma provided it
		    # is not immediately after a '{'. We also accept ASCII
		    # digits. We do not use '\d' for this, because it also
		    # matches non-ASCII Unicode digits, which Perl does not
		    # recognize inside a quantifier.
		    if ( $token->isa( TOKEN_LITERAL ) ) {
			my $character = $token->content();
			if ( $character eq ',' ) {
			    $commas++ and return;
			    return $tokenizer->prior( 'content' ) ne '{';
			}
			return $character =~ m/ \A [0-9] \z /smx;
		    }

		    # Since we do not know what is in an interpolation, we
		    # trustingly accept it.
		    if ( $token->isa( 'PPIx::Regexp::Token::Interpolation' )
		    ) {
			return 1;
		    }

		    # Anything else is an interloper.
		    return;
		},
	    );

	    return $token;
	}

	# The close curly bracket is a little complicated because if the
	# cookie posted by the left curly bracket is still around, we are a
	# quantifier, otherwise not.
	if ( $character eq '}' ) {

	    # Remove the cookie. If there was none, we are not a
	    # quantifier.
	    $tokenizer->cookie( COOKIE_QUANT, undef )
		or return 1;

	    # If the prior token is itself of this class we are not a
	    # quantifier either.
	    $tokenizer->prior( 'class' )->isa( __PACKAGE__ )
		and return 1;

	    # Otherwise make the token and mark it as ending a quantifier.
	    my $token = $tokenizer->make_token( 1 );
	    $token->{is_quantifier} = 1;
	    return $token;
	}

	# The parse rules are different inside a character class, so we set
	# another cookie. Sigh. If your tool is a hammer ...
	if ( $character eq '[' ) {

	    # Set our cookie. Since it always returns 1, it does not matter
	    # where in the following mess we set it.
	    $tokenizer->cookie( COOKIE_CLASS, sub { return 1 } );

	    # Make our token now, since the easiest place to deal with the
	    # beginning-of-character-class strangeness seems to be right
	    # here.
	    my @tokens = $tokenizer->make_token( 1 );

	    # Get the next character, returning tokens if there is none.
	    defined ( $character = $tokenizer->peek() )
		or return @tokens;

	    # If we have a caret, it is a negation operator. Make its token
	    # and fetch the next character, returning if none.
	    if ( $character eq '^' ) {
		push @tokens, $tokenizer->make_token(
		    1, 'PPIx::Regexp::Token::Operator' );
		defined ( $character = $tokenizer->peek() )
		    or return @tokens;
	    }

	    # If we have a close square at this point, it is not the end of
	    # the class, but just a literal. Make its token.
	    $character eq ']'
		and push @tokens, $tokenizer->make_token( 1, TOKEN_LITERAL );

	    # Return all tokens made.
	    return @tokens;
	}
	# per perlop, the metas inside a [] are -]\^$.
	# per perlop, the metas outside a [] are {}[]()^$.|*+?\
	# The difference is that {}[().|*+? are not metas in [], but - is.

	# On encountering our close bracket, we need to delete the cookie.
	if ( $character eq ']' ) {
	    $tokenizer->cookie( COOKIE_CLASS, undef );
	    return 1;
	}

	return 1;
    }

}

# Finalization hook, called by the lexer as a method with no arguments
# after it has finished with all the tokens. It discards the
# is_quantifier attribute, and returns the number of parse failures
# found while finalizing, which for this class is always zero.
sub __PPIX_LEXER__finalize {
    my $self = shift;
    delete $self->{is_quantifier};
    return 0;
}

1;

__END__

# ex: set textwidth=72 :