KSx::Search::RegexpTermQuery - Regular expression term query class for KinoSearch


KSx-Search-WildCardQuery documentation Contained in the KSx-Search-WildCardQuery distribution.

Index


Code Index:

NAME

Top

KSx::Search::RegexpTermQuery - Regular expression term query class for KinoSearch

VERSION

Top

0.05

SYNOPSIS

Top

    use KSx::Search::RegexpTermQuery
    my $query = new KSx::Search::RegexpTermQuery
        regexp => qr/^foo/,
        field  => 'content',
    ;

    $searcher->hits($query);
    # etc.

DESCRIPTION

Top

This module provides search query objects for KinoSearch that find terms that match a particular regular expression. Note that a query will only match a single term; it is not a regexp match against an entire field.

PERFORMANCE

Top

If a regular expression has a fixed literal prefix anchored to the beginning of the string (e.g., the foo in qr/^foo[dl]$/), only the 'foo' words in the index will be scanned, so this should not be too slow, as long as the prefix is fairly long, or there are sufficiently few 'foo' words. If, however, there is no literal prefix (e.g., qr/foo/), the entire index will be scanned, so beware.

METHODS

Top

new

This is the constructor. It constructs. Call it with hash-style arguments as shown in the SYNOPSIS. The regexp can be a qr// thingy or a string.

PREREQUISITES

Top

Hash::Util::FieldHash::Compat

The development version of KinoSearch available at http://www.rectangular.com/svn/kinosearch/trunk, revision 4810 or higher.

AUTHOR & COPYRIGHT

Top

ACKNOWLEDGEMENTS

Top

Much of the code in this module was plagiarized from Marvin Humphrey's KinoSearch modules.

SEE ALSO

Top

KinoSearch, KinoSearch::Search::Query, KSx::Search::WildCardQuery


KSx-Search-WildCardQuery documentation Contained in the KSx-Search-WildCardQuery distribution.

use strict;
use warnings;

package KSx::Search::RegexpTermQuery;
use base qw( KinoSearch::Search::Query );

our $VERSION = '0.05';

use Hash::Util::FieldHash::Compat 'fieldhashes';
fieldhashes \my( %re, %prefix, %field );

sub new {
    my ($package, %args) = @_;

    my $re = delete $args{regexp};
    my $field = delete $args{field};

    my $self = $package->SUPER::new(%args);

    $re{$self} = $re;
    $field{$self} = $field;

    # get the literal prefix of the regexp, if any.
    if($re{$self} =~
        m<^
                        (?:    # prefix for qr//'s, without allowing /i :
                                \(\? ([a-hj-z]*) (?:-[a-z]*)?:
                        )?
                        (\\[GA]|\^) # anchor
                        ([^#\$()*+.?[\]\\^]+) # literal pat (no metachars or comments)
                >x
    ) {{
        my ($mod,$anchor,$prefix) = ($1||'',$2,$3);
	$anchor eq '^' and $mod =~ /m/ and last;
	for($prefix) {
            $mod =~ /x/ and s/\s+//g;
        }
        $prefix{$self} = $prefix;
    }}

    $self;
}

#sub extract_terms {
#    my $self = shift;
#    return @{ $self->{terms} };
#}

sub make_compiler {
    return KSx::Search::RegexpTermCompiler->new(
        parent => @_
    );
}


package KSx::Search::RegexpTermCompiler;
use base qw( KinoSearch::Search::Compiler );

use Hash::Util::FieldHash::Compat 'fieldhashes';

fieldhashes \my ( %idf, %raw_impact, #%plists,
                  %terms,
                  %query_norm_factor, % normalized_impact, %tfs );
sub new {
    my($pack, %args) = @_;

    my $searcher = $args{searchable};
    my $reader      = $searcher->get_reader;
    my $lex_reader  = $reader->fetch("KinoSearch::Index::LexiconReader");
    my $post_reader = $reader->fetch("KinoSearch::Index::PostingsReader");

    # Retrieve the correct Similarity for the Query's field.
    my $sim = $args{similarity} = 
        $searcher->get_schema->fetch_sim($field{$args{parent}});

    my $self = $pack->SUPER::new(%args);

    my $parent = $args{parent};

    # Get a lexicon and find our place therein
    my( $re, $prefix ) = ($re{$parent}, $prefix{$parent});
    ref $re eq 'Regexp' or $re = qr/$re/; # avoid repetitive recompilation
    my $lexcn = $lex_reader->lexicon( field => $field{$parent} );
    $lexcn->seek(defined $prefix ? $prefix : '');
    
    # iterate through it, stopping at terms that match
    my @terms; #my @plists;
    my %hits; # The keys are the doc nums; the values the tfs.

    while () {
        my $term = get_term $lexcn;
        
        # sift out unwanted terms
        last if defined $prefix and index( $term, $prefix ) != 0;
        next unless $term =~ $re;

        # for terms that match...

        push @terms, $term;

        # We have to iterate through the documents in each posting list,
        # recording the doc numbers, so we can calc the doc freq later on.
        # E.g., if there are two documents, one containing ‘dog’ and ‘dot,’
        # and the other containing just ‘dog,’ and the re is /^do.*/, then
        # the doc freq has to be 2, since the re matches two docs. The doc
        # freqs of the individual terms are 1 and 2, so we can’t add or
        # average them.
        my $plist = $post_reader->posting_list(
	                              term => $term,
	                              field => $field{$parent},
	                          );
	my $posting; my $weight;
        while (my $doc_num = $plist ->next) {
            # For efficiency’s sake, we’ll collect the results now, to
            # avoid iterating through postings (the slowest part of search-
            # ing) more than once, even though this code probably belongs
            # in RegexpTermScorer
            my $posting ||= $plist->get_posting;
            $hits{$doc_num} +=
             $weight ||= $posting->get_freq * $posting->get_weight
        }

    } continue {
        last unless  $lexcn->next ;
    }
    my $doc_freq = scalar keys %hits;

    # Save the hits and terms for later
#    $plists{$self} = \@plists;
    $tfs{$self} = \%hits;
    $terms{$self} = \@terms;

    # Calculate and store the IDF
    my $max_doc = $searcher->doc_max;
    my $idf = $idf{$self} = $max_doc
    ?    1 + log( $max_doc / ( 1 + $doc_freq ) )
    :    1
    ;

    $raw_impact{$self} = $idf * $parent->get_boost;

    # make final preparations
    $self->perform_query_normalization($searcher);

    $self;
}

sub perform_query_normalization {
# copied from KinoSearch::Search::Weight originally
    my ( $self, $searcher ) = @_;
    my $sim = $self->get_similarity;

    my $factor = $self->sum_of_squared_weights;    # factor = ( tf_q * idf_t )
    $factor = $sim->query_norm($factor);           # factor /= norm_q
    $self->normalize($factor);                     # impact *= factor
}

sub get_value { shift->get_parent->get_boost }

sub sum_of_squared_weights { $raw_impact{+shift}**2 }

sub normalize { # copied from TermQuery
    my ( $self, $query_norm_factor ) = @_;
    $query_norm_factor{$self} = $query_norm_factor;

    # Multiply raw impact by ( tf_q * idf_q / norm_q )
    #
    # Note: factoring in IDF a second time is correct.  See formula.
    $normalized_impact{$self}
        = $raw_impact{$self} * $idf{$self} * $query_norm_factor;
}

sub make_matcher {
    my $self = shift;

    return KSx::Search::RegexpTermScorer->new(
#        posting_lists => $plists{$self},
        @_,
        compiler      => $self,
    );
}

sub highlight_spans {  # plagiarised form of TermWeight’s routine
    my ($self, %args) = @_;
    my $doc_vector = $args{doc_vec};
    my $field_name = $args{field};
    return if $field{$self->get_parent} ne $field_name;
    my $searcher   = $args{searcher};
    my $terms      = $terms{$self};

    require KinoSearch::Search::Span;

    my @posits;
    my $weight_val = $self->get_value;
    for (@$terms) {
        my $term_vector
            = $doc_vector->term_vector( field => $field_name, term => $_ );
        next unless defined $term_vector;
        my $starts = $term_vector->get_start_offsets->to_arrayref;
        my $ends   = $term_vector->get_end_offsets->to_arrayref;
        while (@$starts) {
            my $start = shift @$starts;
            push @posits, KinoSearch::Search::Span->new(
                offset => $start,
                length   => shift(@$ends)-$start, 
                weight       => $weight_val,
            );
        }
    }

    return \@posits;
}


package KSx::Search::RegexpTermScorer;
use base 'KinoSearch::Search::Matcher';

use Hash::Util::FieldHash::Compat 'fieldhashes';
fieldhashes\my(  %doc_nums, %pos, %wv,  %sim, %compiler );

sub new {
	my ($class, %args) = @_;
#	my $plists = delete $args{posting_lists};
	my $compiler   = delete $args{compiler};
	my $reader     = delete $args{reader};
	my $need_score = delete $args{need_score};
	my $self   = $class->SUPER::new(%args);
	$sim{$self} = $compiler->get_similarity;

	my $tfs = $tfs{$compiler};
	$doc_nums{$self} = [ sort { $a <=> $b } keys %$tfs ];
	
	$pos{$self} = -1;
	$wv {$self} = $compiler->get_value;
	$compiler{$self} = $compiler;
	
	$self
}

sub next {
	my $self = shift;
	my $doc_nums = $doc_nums{$self};
	return 0 if $pos{$self} >= $#$doc_nums;
	return $$doc_nums[ ++$pos{$self} ];
}

sub get_doc_num {
	my $self = shift;
	my $pos = $pos{$self};
	my $doc_nums = $doc_nums{$self};
	return $pos < scalar @$doc_nums ? $$doc_nums[$pos] : 0;
}

sub score {
	my $self = shift;
	my $pos = $pos{$self};
	my $doc_nums = $doc_nums{$self};
	return $wv{$self} * $sim{$self}->tf(
	  $tfs{$compiler{$self}}{$$doc_nums[$pos]}
	);
}


1;

__END__