Unicode::Regex::Set - Subtraction and Intersection of Character Sets in Unicode Regular Expressions


Unicode-Regex-Set documentation Contained in the Unicode-Regex-Set distribution.

Index


Code Index:

NAME

Top

Unicode::Regex::Set - Subtraction and Intersection of Character Sets in Unicode Regular Expressions

SYNOPSIS

Top

    use Unicode::Regex::Set qw(parse);

    $regex = parse('[\p{Latin} & \p{L&} - A-Z]');

DESCRIPTION

Top

Perl 5.8.0 misses subtraction and intersection of characters, which is described in Unicode Regular Expressions (UTS #18). This module provides a mimic syntax of character classes including subtraction and intersection, taking advantage of look-ahead assertions.

The syntax provided by this module is considerably incompatible with the standard Perl's regex syntax.

Any whitespace character (that matches /\s/) is allowed between any tokens. Square brackets ('[' and ']') are used for grouping. A literal whitespace and square brackets must be backslashed (escaped with a backslash, '\'). You cannot put literal ']' at the start of a group.

A POSIX-style character class like [:alpha:] is allowed since its '[' is not a literal.

SEPARATORS ('&' for intersection, '|' for union, and '-' for subtraction) should be enclosed with one or more whitespaces. E.g. [A&Z] is a list of 'A', '&', 'Z'. [A-Z] is a character range from 'A' to 'Z'. [A-Z - Z] is a set by removal of [Z] from [A-Z].

Union operator '|' may be omitted. E.g. [A-Z | a-z] is equivalent to [A-Z a-z], and also to [A-Za-z].

Intersection operator '&' has high precedence, so [\p{A} \p{B} & \p{C} \p{D}] is equivalent to [\p{A} | [\p{B} & \p{C}] | \p{D}].

Subtraction operator '-' has low precedence, so [\p{A} \p{B} - \p{C} \p{D}] is equivalent to [[\p{A} | \p{B}] - [\p{C} | \p{D}] ].

[\p{A} - \p{B} - \p{C}] is a set by removal of \p{B} and \p{C} from \p{A}. It is equivalent to [\p{A} - [\p{B} \p{C}]] and [\p{A} - \p{B} \p{C}].

Negation. when '^' just after a group-opening '[', i.e. when they are combined as '[^', all the tokens following are negated. E.g. [^A-Z a-z] matches anything but neither [A-Z] nor [a-z]. More clearly you can say this with grouping as [^ [A-Z a-z]].

If '^' that is not next to '[' is prefixed to a sequence of literal characters, character ranges, and/or metacharacters, such a '^' only negates that sequence; e.g. [A-Z ^\p{Latin}] matches A-Z or a non-Latin character. But [A-Z [^\p{Latin}]] (or [A-Z \P{Latin}], for this is a simple case) is recommended for clarity.

If you want to remove anything other than PERL from [A-Z], use [A-Z & PERL] as well as [A-Z - [^PERL]]. Similarly, if you want to intersect [A-Z] and a thing not JUNK, use [A-Z - JUNK] as well as [A-Z & [^JUNK]].

For further examples, please see tests.

FUNCTION

Top

$perl_regex = parse($unicode_character_class)

parses a Character Class pattern according to Unicode Regular Expressions and converts it into a regular expression in Perl (returned as a string).

AUTHOR

Top

SADAHIRO Tomoyuki <SADAHIRO@cpan.org>

  http://homepage1.nifty.com/nomenclator/perl/

  Copyright(C) 2003-2003, SADAHIRO Tomoyuki. Japan. All rights reserved.

  This library is free software; you can redistribute it
  and/or modify it under the same terms as Perl itself.

SEE ALSO

Top

http://www.unicode.org/unicode/reports/tr18/

Unicode Regular Expression Guidelines - UTR #18 (to be Unicode Regular Expressions - UTS #18)


Unicode-Regex-Set documentation Contained in the Unicode-Regex-Set distribution.

package Unicode::Regex::Set;

require 5.008;

use strict;
use warnings;
use Carp;
use vars qw($VERSION $PACKAGE @ISA @EXPORT @EXPORT_OK);

require Exporter;
@ISA = qw(Exporter);

@EXPORT_OK = qw(parse maketree tostring);
@EXPORT    = ();

$VERSION = '0.02';
$PACKAGE = __PACKAGE__;

use constant TRUE  => 1;
use constant FALSE => '';

my %Meaning = (
    '[' => 'group beginning',
    ']' => 'group end',
    '&' => 'intersection',
    '|' => 'union',
    ''  => 'union',
    '-' => 'subtraction',
);

#  Token combination table:  e.g  '[' followed by '&' is NG.
#
#	1\2   '['  ']'  '&'  '|'  '-'  Lit
#	'['   OK   NG   NG   NG   NG   OK
#	']'   OK   OK   OK   OK   OK   OK
#	'&'   OK   NG   NG   NG   NG   OK
#	'|'   OK   NG   NG   NG   NG   OK
#	'-'   OK   NG   NG   NG   NG   OK
#	Lit   OK   OK   OK   OK   OK   OK
#
#  Lit, literal, includes A-Z, \[, \|, \-, '\ ' (escaped space), \n, \r,
#       \t, \f, \cA, \ooo, \xhh, \x{hhhh}, \p{Prop}, \N{NAME}, [:posix:].
#       They are retained as they are.
# [=oops=] are not considered.

sub parse { tostring(maketree(@_)) }

#   $node = {
#	parent  => $node_or_undef, # undef for root
#	neg     => $boolean, # true if group begins with '[^'
#	follow  => $boolean, # true if requires literal
#	op      => $char,    # '&', '-', '|'
#	childs  => $arrayref_of_nodes,
#    }

sub maketree {
    my $cur;
    my $arg = shift;

    foreach (ref $arg ? $$arg : $arg) # store in $_
    {
	if (!s/^\[//) {
	    croak "a character class not beginning at [";
	}
	$cur = { parent => undef, op => FALSE, childs => [] };
	s/^\^// and $cur->{neg} = TRUE;

	while (1) {

	    # skip whitespaces
	    if (s/^\s+//) {
		next;
	    }

	    # beginning of a group
	    if (s/^\[  (?! \: [^\[\]\:]+ \:\] )//x) {
		if ($cur->{op} eq '&' && !$cur->{follow}) {
		    $cur = $cur->{parent};
		}

		push @{ $cur->{childs} },
			+{ parent => $cur, op => FALSE, childs => [] };

		$cur = $cur->{childs}->[-1];
		s/^\^// and $cur->{neg} = TRUE;
		next;
	    }

	    # end of a group
	    if (s/^\]//) {
		if (! $cur->{childs} || ! @{ $cur->{childs} }) {
		    croak "empty (sub)group in a character class";
		}

		if ($cur->{op} eq '&' && !$cur->{follow}) {
		    $cur = $cur->{parent};
		}

	    # LAST:
		last if ! $cur->{parent};

		if ($cur->{follow}) {
		    my $op = $cur->{op};
		    croak "no operand after '$op' ($Meaning{$op})";
		}

		$cur = $cur->{parent};

		$cur->{follow} and $cur->{follow} = FALSE;
		next;
	    }

	    # operators
	    if (s/^([\&\|\-])(?=[\s\[\]])//) {
		my $o = $1;

		if (! $cur->{childs} || ! @{ $cur->{childs} }) {
		    croak "no operand before '$o' ($Meaning{$o})";
		}

		if ($cur->{follow}) {
		    my $p = $cur->{op};
		    croak "no operand between '$p' ($Meaning{$p}) "
			. "and '$o' ($Meaning{$o})";
		}

		if ($cur->{op} eq $o)
		{
		    $cur->{follow} = TRUE;
		    next;
		}

		if ($cur->{op} eq '&' && !$cur->{follow})
		    # in this case $op must not be '&' (see the prev block)
		    # '&' has high precedence: [A & B - C] as [[A & B] - C]
		{
		    $cur = $cur->{parent};
		}

		if ($o eq '&')
		    # '&' has high precedence: [A B & C D] as [A [B & C] D]
		{
		    my $last = pop @{ $cur->{childs} };

		    push @{ $cur->{childs} },
			{ parent => $cur, op => $o, childs => [ $last ] };

		    $cur = $cur->{childs}->[-1];
		    $cur->{follow} = TRUE;
		    next;
		}

		if ($o eq '-') {
		    if (@{ $cur->{childs} } > 1)
			# '-' has low precedence: [A B - C] as [[A B] - C]
		    {
			my @kids = @{ $cur->{childs} };
			@{ $cur->{childs} } =
			    { parent => $cur, op => FALSE, childs => \@kids };
		    }

		    $cur->{op} = $o;
		    next;
		}

		if ($o eq '|') { # simple union
		    $cur->{op} = $o;
		    next;
		}
	    }


	    if (s/^((?:
		    		    \\[pPN]\{ [^{}]* \}
		  		  | \\c?(?s:.)
		  		  | [^\s\[\]]
		  		  | \[\: [^\[\]\:]+ \:\]
						)+)//x)
	    {
		my $lit = $1;

		if ($lit eq '^') {
		    croak "A bare '^', that has nothing to be negated.";
		}

		if ($cur->{op} eq '&' && !$cur->{follow})
		    # '&' has high precedence: [A & B C] as [[A & B] C]
		{
		    $cur = $cur->{parent};
		}

		$cur->{follow} and $cur->{follow} = FALSE;
		my $kid = $cur->{childs};

		if (@$kid
		    && ! ref($kid->[-1])
		    && $lit	  !~ /^[\-\^]/
		    && $kid->[-1] !~ /^\[\^/
		    && $kid->[-1] !~ /\-\]\z/
		    && $cur->{op} ne '&'
		    && !($cur->{op} eq '-' && @$kid == 1))
		# this is only simplification, so avoids uncertain cases
		{
		    substr($kid->[-1], -1, 0, $lit);
		}
		else {
		    push @$kid, "[$lit]";
		}
		next;
	    }

	    croak "panic or incomplete character class (missing last ']'?);";
	}
    }

    return $cur;
}

sub tostring {
    my $list = shift;

    for (@{ $list->{childs} }) {
	next  if !ref($_);
	croak "panic" if ref($_) ne 'HASH';
	$_ = tostring($_); # recursive
    }
    my $ret;
    my $op   = $list->{op} || FALSE;
    my $kids = $list->{childs};

    if ($op eq '&') {
	my $base = shift @$kids;
	my $pre  = join '', map "(?=$_)", @$kids;
	$ret = "(?:$pre$base)";
    }
    elsif ($op eq '-') {
	my $base = shift @$kids;
	my $pre  = join('|', @$kids);
	$ret = "(?:(?!$pre)$base)";
    }
    else {
	$ret = @$kids > 1 ? "(?:".join('|', @$kids).")" : $kids->[0];
    }
    return $list->{neg} ? "(?:(?!$ret)(?s:.))" : $ret;
}

1;
__END__