Net::Z3950::IndexMARC - Comprehensive but inefficent index for MARC records


Net-Z3950-RadioMARC documentation Contained in the Net-Z3950-RadioMARC distribution.

Index


Code Index:

NAME

Top

Net::Z3950::IndexMARC - Comprehensive but inefficent index for MARC records

SYNOPSIS

Top

 $file = MARC::File::USMARC->in($filename);
 $index = new Net::Z3950::IndexMARC();
 while ($marc = $file->next()) {
     $index->add($marc);
 }
 $index->dump(\*STDOUT);
 $hashref = $index->find('@attr 1=4 dinosaur');
 foreach $i (keys %$hashref) {
    $rec = $index->fetch($i);
    print $rec->as_formatted();
 }

DESCRIPTION

Top

This module provides a comprehensive inverted index across a set of MARC records, allowing simple keyword retrieval down to the level of individual field and subfields. However, it does this by building a big Perl data-structure (hash of hashes of arrays) in memory, and makes no efforts whatsoever towards optimisation. So this is only appropriate for small collections of records.

METHODS

Top

new()

 $index = new Net::Z3950::IndexMARC();

Creates a new IndexMARC object. Takes no parameters, and returns the new object.

add()

 $record = new MARC::Record();
 $record->append_fields(...);
 $index->add($record);

Adds a single MARC record to the specified index. A reference to the record itself is also added, so the record object will not be garbage collected until (at least) the index goes out of scope. The record passed in must be of the type MARC::Record.

An opaque token representing the new record is returned. This may subsequently be passed to fetch() to retrieve the record.

dump()

 $index->dump(\*STDOUT);

Dumps the contents of the specified index to the specified stream in human-readable form. Takes no arguments. Should only be used for debugging.

find()

 $hithash = $index->find("@and fruit fish");

Finds records satisfying the specified PQF query, and returns a reference to a hash consisting of one element for each matching record.

Each key in the returned hash is an opaque token representing a record, which may be fed to fetch() to retrieve the record itself. The corresponding value contains details of the hits in that record. The hit details consist of an array of arbitrary length, one element per occurrence of the searched-for term. Each element of this array is itself an array of three elements: the tag of the field in which the term exists [0], the tag of the subfield [2], and the word-number within the field, starting from word 1 [3].

PQF is Prefix Query Format, as described in the ``Tools'' section of the YAZ manual; however, this module does not perform field-specific searching since to do so would necessarily involve a mapping between Type-1 query access points and MARC fields, which we want to avoid having to assume anything about. Accordingly, use attributes are ignored. Further, at present boolean operations are also refused, and only the single-term queries are supported.

fetch()

 $marc = $index->fetch($token);

Returns the MARC::Record object corresponding to the specified record token, as returned from add() or find().

PROVENANCE

Top

This module is part of the Net::Z3950::RadioMARC distribution. The copyright, authorship and licence are all as for the distribution.


Net-Z3950-RadioMARC documentation Contained in the Net-Z3950-RadioMARC distribution.
# $Id: IndexMARC.pm,v 1.19 2005/04/27 10:41:14 mike Exp $

package Net::Z3950::IndexMARC;

use 5.008;
use strict;
use warnings;

use MARC::Record;
use Net::Z3950::PQF 0.03;


sub new {
    my $class = shift();

    return bless {
	records => [],
	index => {},		# maps queryable terms into records[]
	pqf => undef,		# PQF parser, created on demand
    }, $class;
}


sub add {
    my $this = shift();
    my($marc) = @_;

    my $reccount = @{ $this->{records} };
    push @{ $this->{records} }, $marc;
    my $index = $this->{index};

    foreach my $field ($marc->fields()) {
	my $tag = $field->tag();
	if ($tag < "010") {
	    # Control fields must be handled separately, or ignored
	    next;
	}

	my @subfields = $field->subfields();
	foreach my $ref (@subfields) {
	    my($subtag, $value) = @$ref;

	    ### We might consider a more sophisticated word-parsing scheme
	    my @words = (lc($value)); # the whole field is word zero
	    $value =~ s/^\s+//;
	    push @words, split(/[\s,\.:\/]+/, $value);

	    for (my $pos = 0; $pos < @words; $pos++) {
		my $word = $words[$pos];
		my $indexentry = [ $tag, $subtag, $pos ];

		$word = lc($word); # case-insensitive indexing
		my $wordref = $index->{$word};
		if (!defined $wordref) {
		    # It's the first we've seen this word in any record
		    $index->{$word} = { $reccount => [ $indexentry ] };
		    next;
		}

		my $recref = $wordref->{$reccount};
		if (!defined $recref) {
		    # First time we've seen the word in this record
		    $wordref->{$reccount} = [ $indexentry ];
		    next;
		}

		# Second or subsequent occurrence of word in record
		push @$recref, $indexentry;
	    }
	}
    }

    return $reccount;
}


sub dump {
    my $this = shift();
    my($stream) = @_;

    my $index = $this->{index};
    foreach my $word (sort keys %$index) {
	my $wordref = $index->{$word};
	my $gotWord = 0;
	foreach my $reccount (sort { $a <=> $b } keys %$wordref) {
	    print $stream sprintf("%-30s", $gotWord++ ? "" : "'$word'");
	    my $recref = $wordref->{$reccount};
	    my $gotRec = 0;
	    foreach my $indexentry (@$recref) {
		print $stream sprintf("%-8s",
				      $gotRec++ ? " " x 38 : "rec $reccount");
		my($tag, $subtag, $pos) = @$indexentry;
		print $stream "$tag\$$subtag word $pos\n";
	    }
	}
    }
}


sub find {
    my $this = shift();
    my($pqf) = @_;

    return { 0 => [] } if @{$this->{records}} == 1;

    $this->{pqf} = new Net::Z3950::PQF()
	if !defined $this->{pqf};

    my $parser = $this->{pqf};
    my $node = $parser->parse($pqf);
    ### Should have a nicer way to report this error
    die "Can't parse PQF '$pqf': " . $parser->errmsg()
	if !defined $node;

    return $this->_find($node);
}


sub _find {
    my $this = shift();
    my($node) = @_;

    if ($node->isa("Net::Z3950::PQF::TermNode")) {
	return $this->_find_term($node);
    } if ($node->isa("Net::Z3950::PQF::BooleanNode")) {
	return $this->_find_boolean($node);
    } else {
	die "unsupported node type $node";
    }
}


sub _find_term {
    my $this = shift();
    my($term) = @_;

    ### This is a very clumsy way to handle truncation etc.
    my $rs = {};
    my $index = $this->{index};
    foreach my $key (keys %$index) {
	my $hits = $index->{$key};
	if ($this->_match($term, $key, $hits)) {
	    foreach my $recnum (keys %$hits) {
		push @{ $rs->{$recnum} }, @{ $hits->{$recnum} };
	    }
	}
    }

    return $rs;
}


sub _match {
    my $this = shift();
    my($term, $key, $hits) = @_;

    my($trunc, $comp);
    foreach my $attr (@{ $term->{attrs} }) {
	my($set, $type, $val) = @$attr;
	# In BIB-1, type 5 is truncation and 6 is completeness
	$trunc = $val if $type == 5;
	$comp = $val if $type == 6;
    }

    my $value = lc($term->{value});
    if (defined $comp && ($comp == 2 || $comp == 3)) {
	# Complete subfield or field
	use Data::Dumper;
	#print "*whole-field match against '$value': key='$key', hits=", Dumper($hits);
    }

    my $vlen = length($value);
    if (!defined $trunc || $trunc == 100) {
	# No truncation
	return $value eq $key;
    } elsif ($trunc == 1) {
	# Right truncation
	#print "*testing '$value*' against '$key'\n";
	return $value eq substr($key, 0, $vlen);
    } elsif ($trunc == 2) {
	# Left truncation
	#print "*testing '*$value' against '$key'\n";
	return $value eq substr($key, -$vlen, $vlen);
    } elsif ($trunc == 3) {
	# Left and right truncation ... sigh
	my $klen = length($key);
	#print "*testing '*$value*' against '$key'; vlen=$vlen, klen=$klen\n";
	for (my $i = 0; $i <= $klen-$vlen; $i++) {
	    #print " *comparing '$value' to '", substr($key, $i, $vlen), "'\n";
	    return 1 if $value eq substr($key, $i, $vlen);
	}
	return 0;
    }

    die "unsupported truncation value $trunc";
}


sub _find_boolean {
    my $this = shift();
    my($node) = @_;

    my @subres = map { $this->_find($_) } @{ $node->{sub} };
    my($s1, $s2) = @subres;
    my $final = {};

    if ($node->isa("Net::Z3950::PQF::AndNode")) {
	foreach my $key (keys %$s1) {
	    if (defined $s2->{$key}) {
		$final->{$key} = $this->_merge_info($s1->{$key}, $s2->{$key});
	    }
	}

    } elsif ($node->isa("Net::Z3950::PQF::OrNode")) {
	my %c2 = %$s2;
	foreach my $key (keys %$s1) {
	    if (defined $c2{$key}) {
		$final->{$key} = $this->_merge_info($s1->{$key}, $c2{$key});
		delete $c2{$key};
	    } else {
		$final->{$key} = $s1->{$key};
	    }
	}
	foreach my $key (keys %c2) {
	    $final->{$key} = $c2{$key};
	}

    } elsif ($node->isa("Net::Z3950::PQF::NotNode")) {
	foreach my $key (keys %$s1) {
	    if (!defined $s2->{$key}) {
		$final->{$key} = $s1->{$key};
	    }
	}

    } else {
	die "Unknown boolean node-type: $node";
    }

    return $final;
}


sub _merge_info {
    my $this = shift();
    my($info1, $info2) = @_;

    if (0) {
	use Data::Dumper;
	print("_merge_info: ",
	      "info1=", Dumper($info1),
	      "info2=", Dumper($info2),
	      "\n");
    }

    ### Should do much, much better!
    return 1;
}


sub fetch {
    my $this = shift();
    my($num) = @_;

    my $records = $this->{records};
    my $count = scalar(@$records);
    die "record number $num out of range 0.." . ($count-1)
	if $num < 0 || $num >= $count;
    return $records->[$num];
}



1;