Text::Query::ParseAdvanced - Parse AltaVista advanced query syntax


Text-Query documentation Contained in the Text-Query distribution.

Index


Code Index:

NAME

Top

Text::Query::ParseAdvanced - Parse AltaVista advanced query syntax

SYNOPSIS

Top

  use Text::Query;
  my $q=new Text::Query('hello and world',
                        -parse => 'Text::Query::ParseAdvanced',
                        -solve => 'Text::Query::SolveAdvancedString',
                        -build => 'Text::Query::BuildAdvancedString');




DESCRIPTION

Top

This module provides an object that parses a string containing a Boolean query expression similar to an AltaVista "advanced query".

It's base class is Text::Query::Parse;

Query expressions consist of literal strings (or regexps) joined by the following operators, in order of precedence from lowest to highest:

OR, |
AND, &
NEAR, ~
NOT, !

Operator names are not case-sensitive. Note that if you want to use a | in a regexp, you need to backwhack it to keep it from being seen as a query operator. Sub-expressions may be quoted in single or double quotes to match "and," "or," or "not" literally and may be grouped in parentheses ((, )) to alter the precedence of evaluation.

A parenthesized sub-expression may also be concatenated with other sub- expressions to match sequences: (Perl or Python) interpreter would match either "Perl interpreter" or "Python interpreter". Concatenation has a precedence higher than NOT but lower than AND. Juxtaposition of simple words has the highest precedence of all.

OPTIONS

Top

These are the additional options of the prepare method and the constructor.

-near defaults to 10

Sets the number of words that can occur between two expressions and still satisfy the NEAR operator.

-operators defaults to and, or, not, near

Sets the operator names. The argument of the option is a pointer to a hash table mapping the default names to desired names. For instance:

    {
	'or' => 'ou',
	'and' => 'et',
	'near' => 'proche',
	'not' => 'non',
    }

-scope_map default to {}

Map the scope names to other names. If a scope is specified as scope: search the map for an entry whose key is scope and replace scope with the scalar found. For instance:

     {
	 'scope' => 'otherscope'
     }

SEE ALSO

Top

Text::Query(3) Text::Query::Parse(3)

AUTHORS

Top

Eric Bohlman (ebohlman@netcom.com)

Loic Dachary (loic@senga.org)


Text-Query documentation Contained in the Text-Query distribution.

#
#   Copyright (C) 1999 Eric Bohlman, Loic Dachary
#
#   This program is free software; you can redistribute it and/or modify it
#   under the terms of the GNU General Public License as published by the
#   Free Software Foundation; either version 2, or (at your option) any
#   later version.  You may also use, redistribute and/or modify it
#   under the terms of the Artistic License supplied with your Perl
#   distribution
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with this program; if not, write to the Free Software
#   Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
#
# 
# $Header: /usr/local/cvsroot/Text-Query/lib/Text/Query/ParseAdvanced.pm,v 1.2 1999/06/14 12:53:58 loic Exp $
#
package Text::Query::ParseAdvanced;

use strict;

use Carp;
use Text::Query::Parse;

use vars qw(@ISA);

@ISA = qw(Text::Query::Parse);

sub prepare {
    my($self) = shift;
    my($qstring) = shift;
    my(%args) = @_;

    my $default_operators = {
	'or' => 'or',
	'and' => 'and',
	'near' => 'near',
	'not' => 'not',
    };

    $self->{'scope_map'} = $args{-scope_map} || {};

    return $self->SUPER::prepare($qstring, -near=>10, -operators=>$default_operators, @_);
}

sub expression($) {
    my($self) = shift;
    my($rv, $t);
    my($or) = $self->{parseopts}{-operators}{or};
    my($tokens) = $self->{'tokens'};
    $self->{'token'} = shift(@$tokens);
    $rv = $self->conj();
    while(defined($self->{'token'}) and $self->{'token'} =~ /^($or|\|)$/i) {
	$self->{'token'} = shift(@{$self->{'tokens'}});
	$t= $self->conj();
	$rv = $self->build_expression($rv,$t);
    }
    return $self->build_expression_finish($rv); 
}

sub conj($) {
    my($self) = shift;
    my($rv);
    my($first) = 1;
    my($and) = $self->{parseopts}{-operators}{and};
    $rv = $self->concat();
    while(defined($self->{'token'}) and $self->{'token'} =~ /^($and|&)$/i) {
	$self->{'token'} = shift(@{$self->{'tokens'}});
	$rv = $self->build_conj($rv, concat($self), $first);
	$first=0;
    }
    return $rv;
}

sub concat($) {
    my($self) = shift;
    my($rv,$t,$l);
    my($not) = $self->{parseopts}{-operators}{not};
    my($near) = $self->{parseopts}{-operators}{near};
    $rv = factor($self);
    while(defined($self->{'token'}) and ($l = $self->{'token'}) =~ /^\e|([\(!\~]|$not|$near)$/i) {
	$self->{'token'} = shift(@{$self->{'tokens'}}) if($l =~ /^($near|\~)$/i);
	$t = factor($self);
	if($l =~ /^($near|\~)$/i) {
	    $rv = $self->build_near($rv, $t);
	} else {
	    $rv = $self->build_concat($rv, $t);
	}
    }
    return $rv;
}

sub factor($) {
    my($self) = shift;

    my($rv,$t);
    my($not) = $self->{parseopts}{-operators}{not};
    if(!defined($t = $self->{'token'})) {
	croak("out of token in factor");
    } elsif($t eq '(') {
	$rv = $self->expression();
	if(defined($self->{'token'}) and $self->{'token'} eq ')') {
	    $self->{'token'} = shift(@{$self->{'tokens'}});
	} else {
	    croak("missing closing parenthesis in factor");
	}
    } elsif($t =~ /^($not|!)$/i) {
	$self->{'token'} = shift(@{$self->{'tokens'}});
	$rv = $self->build_negation(factor($self));
    } elsif($t =~ s/^\e//) {
	$rv = $self->build_literal($t);
	$self->{'token'} = shift(@{$self->{'tokens'}});
    } elsif($t =~ s/:$//) {
	$self->{'token'} = shift(@{$self->{'tokens'}});
	unshift(@{$self->{'scope'}}, ($self->{'scope_map'}{$t} || $t));
	$self->build_scope_start();
	$rv = $self->build_scope_end(factor($self));
	shift(@{$self->{'scope'}});
    } else {
	croak("unexpected token $t in factor");
    }
    return $rv;
}

sub parse_tokens {
    local($^W) = 0;
    my($self) = shift;
    my($line) = @_;
    my($quote, $quoted, $unquoted, $delim, $word);
    my($quotes) = $self->{parseopts}{-quotes};
    my($operators) = join("|", values(%{$self->{parseopts}{-operators}}));
    my(@tokens) = ();

    warn("quotes = $quotes") if($self->{-verbose} > 1);
    while(length($line)) {
	($quote, $quoted, undef, $unquoted, $delim, undef) =
	    $line =~ m/^([$quotes])                 # a $quote
                                ((?:\\.|(?!\1)[^\\])*)    # and $quoted text
                                \1 		       # followed by the same quote
                                ([\000-\377]*)	       # and the rest
	       	       |                       # --OR--
                                ^((?:\\.|[^\\$quotes])*?)    # an $unquoted text
	        	        (\Z(?!\n)|(?:\s*([()|&!\~]|\b(?:$operators)\b|\b(?:[-,_\.\w]+\:))\s*)|(?!^)(?=[$quotes])) # plus EOL, delimiter, or quote
                                ([\000-\377]*)	       # the rest
	       	       /ix;		       # extended layout

	warn("quote = $quote") if($self->{-verbose} > 1 && $quote);
	last unless($quote || length($unquoted) || length($delim));
	$line = $+;
	$unquoted =~ s/^\s+//;
	$unquoted =~ s/\s+$//;
	$word .= defined($quote) ? $quoted : $unquoted;
	warn("word = $word") if($self->{-verbose} > 1 and (length($word) and (length($delim) or !length($line))));
	push(@tokens,"\e$word") if(length($word) and (length($delim) or !length($line)));
	$delim =~ s/^\s+//;
	$delim =~ s/\s+$//;
	warn("delim = $word") if($self->{-verbose} > 1 and length($delim));
	push(@tokens, $delim) if(length($delim));
	undef $word if(length($delim));
    }

    warn("parsed tokens @tokens") if($self->{-verbose} > 1);

    $self->{'tokens'} = \@tokens;
}

1;

__END__

# Local Variables: ***
# mode: perl ***
# End: ***