Regexp::Optimizer - optimizes regular expressions


Regexp-Optimizer documentation Contained in the Regexp-Optimizer distribution.

Index


Code Index:

NAME

Top

Regexp::Optimizer - optimizes regular expressions

SYNOPSIS

Top

  use Regexp::Optimizer;
  my $o  = Regexp::Optimizer->new;
  my $re = $o->optimize(qr/foobar|fooxar|foozap/);
  # $re is now qr/foo(?:[bx]ar|zap)/

ABSTRACT

Top

This module does, ahem, attempts to, optimize regular expressions.

INSTALLATION

Top

To install this module type the following:

   perl Makefile.PL
   make
   make test
   make install

DESCRIPTION

Top

Here is a quote from perltodo.

Factoring out common suffices/prefices in regexps (trie optimization)

Currently, the user has to optimize "foo|far" and "foo|goo" into "f(?:oo|ar)" and "[fg]oo" by hand; this could be done automatically.

This module implements just that.

EXPORT

Since this is an OO module there is no symbol exported.

METHODS

Top

This module is implemented as a subclass of Regexp::List. For methods not listed here, see Regexp::List.

$o = Regexp::Optimizer->new;
$o->set(key => value, ...)

Just the same us Regexp::List except for the attribute below;

unexpand

When set to one, $o->optimize() tries to $o->expand before actually starting the operation.

  # cases you need to set expand => 1
  $o->set(expand => 1)->optimize(qr/
                                   foobar|
                                   fooxar|
                                   foozar
                                   /x);

$re = $o->optimize(regexp);

Does the job. Note that unlike ->list2re() in Regexp::List, the argument is the regular expression itself. What it basically does is to find groups will alterations and replace it with the result of $o->list2re.

$re = $o->list2re(list of words ...)

Same as list2re() in Regexp::List in terms of functionality but how it tokenize "atoms" is different since the arguments can be regular expressions, not just strings. Here is a brief example.

  my @expr = qw/foobar fooba+/;
  Regexp::List->new->list2re(@expr) eq qr/fooba[\+r]/;
  Regexp::Optimizer->new->list2re(@expr) eq qr/foob(?:a+ar)/;

CAVEATS

Top

This module is still experimental. Do not assume that the result is the same as the unoptimized version.

BUGS

Top

PRACTICALITY

Top

Though this module is still experimental, It is still good enough even for such deeply nested regexes as the followng.

  # See 3.2.2 of  http://www.ietf.org/rfc/rfc2616.txt
  # BNF faithfully turned into a regex
  http://(?:(?:(?:(?:(?:[a-z]|[A-Z])|[0-9])|(?:(?:[a-z]|[A-Z])|[0-9])(?:(?:(?:[a-z]|[A-Z])|[0-9])|-)*(?:(?:[a-z]|[A-Z])|[0-9]))\.)*(?:(?:[a-z]|[A-Z])|(?:[a-z]|[A-Z])(?:(?:(?:[a-z]|[A-Z])|[0-9])|-)*(?:(?:[a-z]|[A-Z])|[0-9]))\.?|[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)(?::[0-9]*)?(?:/(?:(?:(?:(?:[a-z]|[A-Z])|[0-9])|[\-\_\.\!\~\*\'\(\)])|%(?:[0-9]|[A-Fa-f])(?:[0-9]|[A-Fa-f])|[:@&=+$,])*(?:;(?:(?:(?:(?:[a-z]|[A-Z])|[0-9])|[\-\_\.\!\~\*\'\(\)])|%(?:[0-9]|[A-Fa-f])(?:[0-9]|[A-Fa-f])|[:@&=+$,])*)*(?:/(?:(?:(?:(?:[a-z]|[A-Z])|[0-9])|[\-\_\.\!\~\*\'\(\)])|%(?:[0-9]|[A-Fa-f])(?:[0-9]|[A-Fa-f])|[:@&=+$,])*(?:;(?:(?:(?:(?:[a-z]|[A-Z])|[0-9])|[\-\_\.\!\~\*\'\(\)])|%(?:[0-9]|[A-Fa-f])(?:[0-9]|[A-Fa-f])|[:@&=+$,])*)*)*(?:\\?(?:[;/?:@&=+$,]|(?:(?:(?:[a-z]|[A-Z])|[0-9])|[\-\_\.\!\~\*\'\(\)])|%(?:[0-9]|[A-Fa-f])(?:[0-9]|[A-Fa-f]))*)?)?

  # and optimized
  http://(?::?[a-zA-Z0-9](?:[a-zA-Z0-9\-]*[a-zA-Z0-9])?\.[a-zA-Z]*(?:[a-zA-Z0-9\-]*[a-zA-Z0-9])?\.?|[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)(?::[0-9]*)?(?:/(?:(?:(?:[a-zA-Z0-9\-\_\.\!\~\*\'\x28\x29]|%[0-9A-Fa-f][0-9A-Fa-f])|[:@&=+$,]))*(?:;(?:(?:(?:[a-zA-Z0-9\-\_\.\!\~\*\'\x28\x29]|%[0-9A-Fa-f][0-9A-Fa-f])|[:@&=+$,]))*)*(?:/(?:(?:(?:[a-zA-Z0-9\-\_\.\!\~\*\'\x28\x29]|%[0-9A-Fa-f][0-9A-Fa-f])|[:@&=+$,]))*(?:;(?:(?:(?:[a-zA-Z0-9\-\_\.\!\~\*\'\x28\x29]|%[0-9A-Fa-f][0-9A-Fa-f])|[:@&=+$,]))*)*)*(?:\\?(?:(?:[;/?:@&=+$,a-zA-Z0-9\-\_\.\!\~\*\'\x28\x29]|%[0-9A-Fa-f][0-9A-Fa-f]))*)?)?

By carefully examine both you can find that character classes are properly aggregated.

SEE ALSO

Top

Regexp::List -- upon which this module is based

eg/ directory in this package contains example scripts.

Perl standard documents
 L<perltodo>, L<perlre>

CPAN Modules

Regexp::Presuf, Text::Trie

Books

Mastering Regular Expressions http://www.oreilly.com/catalog/regex2/

AUTHOR

Top

Dan Kogai <dankogai@dan.co.jp>

COPYRIGHT AND LICENSE

Top


Regexp-Optimizer documentation Contained in the Regexp-Optimizer distribution.

#
# $Id: Optimizer.pm,v 0.15 2004/12/05 16:07:34 dankogai Exp dankogai $
#
package Regexp::Optimizer;
use 5.006; # qr/(??{}/ needed
use strict;
use warnings;
use base qw/Regexp::List/;
our $VERSION = do { my @r = (q$Revision: 0.15 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };

#our @EXPORT = qw();
#our %EXPORT_TAGS = ( 'all' => [ qw() ] );
#our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
#our $DEBUG     = 0;

# see perldoc perlop

# perldoc perlop on perl 5.8.4 or later
#
#  Pragmata are now correctly propagated into (?{...}) constructions in
#  regexps.  Code such as
#
#    my $x = qr{ ... (??{ $x }) ... };
#
#   will now (correctly) fail under use strict. (As the inner $x is 
#   and has always referred to $::x)

our $RE_PAREN; # predeclear
$RE_PAREN = 
    qr{
              \(
              (?:
		(?> [^()]+ )
		|
		(??{ $RE_PAREN })
              )*
              \)
            }xo;
our $RE_EXPR; # predeclear
$RE_EXPR = 
    qr{
              \{
              (?:
		(?> [^{}]+ )
		|
		(??{ $RE_EXPR })
              )*
              \}
            }xo;
our $RE_PIPE = qr/(?!\\)\|/o;
our $RE_CHAR = 
    qr{(?:
		# single character...
		(?!\\)[^\\\[(|)\]]       | # raw character except '[(|)]'
		$Regexp::List::RE_XCHAR  | # extended characters
              )}xo;
our $RE_CCLASS = 
    qr{(?:
		(?!\\)\[ $RE_CHAR+? \] |
		$Regexp::List::RE_XCHAR      | # extended characters
		(?!\\)[^(|)]                 | # raw character except '[(|)]'
		# Note pseudo-characters are not included
        )}xo;
our $RE_QUANT =
    qr{(?:
		(?!\\)
	    	    (?:
	     	     \? |
	     	     \+ |
	     	     \* |
	     	     \{[\d,]+\}
	     	     )\??
		)}xo;
our $RE_TOKEN = 
    qr{(?:
		(?:
		\\[ULQ] (?:$RE_CHAR+)(?:\\E|$) | # [ul]c or quotemeta
                $Regexp::List::RE_PCHAR  | # pseudo-characters
                $RE_CCLASS |
		$RE_CHAR     
              )
	 	 $RE_QUANT?
              )}xo;
our $RE_START = $Regexp::List::RE_START;

our %PARAM = (meta      => 1,
	      quotemeta => 0,
	      lookahead => 0,
	      optim_cc  => 1,
	      modifiers => '',
	      _char     => $RE_CHAR,
	      _token    => $RE_TOKEN,
	      _cclass   => $RE_CCLASS,
	     );

sub new{
    my $class = ref $_[0] ? ref shift : shift;
    my $self = $class->SUPER::new;
    $self->set(%PARAM, @_);
    $self;
}

sub list2re{
    shift->SUPER::list2re(map {_strip($_)} @_);
}

sub optimize{
    my $self = shift;
    my $str  = shift;
    $self->{unexpand} and $str = $self->unexpand($str);
    # safetey feature against qq/(?:foo)(?:bar)/
    !ref $str and $str =~ /^$RE_START/ and $str = qr/$str/;
    my $re = $self->_optimize($str);
    qr/$re/;
}

sub _strip{
    my ($str, $force) = @_;
    $force or ref $str eq 'Regexp' or return $str;
    $str =~ s/^($RE_START)//o or return $str;
    my $regopt = $1;  $str =~ s/\)$//o;
    $regopt =~ s/^\(\??//o; 
    $regopt =~ /^[-:]/ and $regopt = undef;
    ($str, $regopt);
}

my %my_l2r_opts = 
    (
     as_string => 1, 
     debug     => 0,
     _token    => qr/$RE_PAREN$RE_QUANT?|$RE_PIPE|$RE_TOKEN/,
    );

sub _optimize{
    no warnings 'uninitialized';
    my $self = shift;
    $self->{debug} and $self->{_indent}++;
    $self->{debug} and
	print STDERR '>'x $self->{_indent}, " ", $_[0], "\n";
    my ($result, $regopt)  = _strip(shift, 1);
    $result =~ s/\\([()])/"\\x" . sprintf("%X", ord($1))/ego;
    # $result =~ s/(\s)/"\\x" . sprintf("%X", ord($1))/ego;
    $result !~ /$RE_PIPE/ and goto RESULT;
    my $l = $self->clone->set(%my_l2r_opts);
    # optimize
    unless ($result =~ /$RE_PAREN/){
        my @words = split /$RE_PIPE/ => $result;
        $result = $l->list2re(@words);
	goto RESULT;
    }
    my (@term, $sp);
    while ($result){
	if ($result =~ s/^($RE_PAREN)($RE_QUANT?)//){
	    my ($term, $quant) = ($1, $2);
	    $term = $self->_optimize($term);
	    $l->{optim_cc} = $quant ? 0 : 1;
	    if ($quant){
		if ($term =~ /^$self->{_cclass}$/){
		    $term .= $quant;
		}else{
		    $term = $self->{po} . $term . $self->{pc} . $quant;
		}
	    }
	    $term[$sp] .= $term;
	}elsif($result =~ s/^$RE_PIPE//){
	    $sp += 2;
	    push @term, '|';
	}elsif($result =~ s/^($RE_TOKEN+)//){
	    # warn $1;
	    $term[$sp] .= $1;
	}else{
	    die "something is wrong !";
	}
    }
    # warn scalar @term , ";", join(";" => @term);
    # sleep 1;
    my @stack;
    while (my $term = shift @term){
	if ($term eq '|'){
	    push @stack, $l->list2re(pop @stack, shift @term);
	}else{
	    push @stack, $term;
	}
    }
    $result = join('' => @stack);
 RESULT:
    $result =  $regopt ? qq/(?$regopt$result)/ : $result;
    # warn qq($result, $regopt);
    $self->{debug} and 
	print STDERR '<'x $self->{_indent}, " ", $result, "\n";
    $self->{debug} and $self->{_indent}--;
    $result;
}

sub _pair2re{
    my $self = shift;
    $_[0] eq $_[1] and return $_[0];
    my ($first, $second) =
	length $_[0] <= length $_[1] ? @_ : ($_[1], $_[0]);
    my $l = length($first);
    $l -= 1
	while $self->_head($first, $l) ne $self->_head($second, $l);
    $l > 0 or return join("", @_);
    return $self->_head($first, $l) . 
	$self->{po} . 
	$self->_tail($first, $l) . '|' . $self->_tail($second, $l) .
	$self->{pc};
}

1;
__END__

# Below is stub documentation for your module. You'd better edit it!