XML::Tape::Index - a XMLtape indexer


XML-Tape documentation Contained in the XML-Tape distribution.

Index


Code Index:

NAME

Top

XML::Tape::Index - a XMLtape indexer

SYNOPSIS

Top

 use XML::Tape::Index qw(:all);

 unless (indexexists('ex/tape.xml')) {
     $x = indexopen('ex/tape.xml', 'w');
     $x->reindex;
     $x->indexclose();
 }

 $x = indexopen('ex/tape.xml', 'r');

 for (my $rec = $x->list_identifiers();
      defined($rec);
      $rec = $x->list_identifiers($rec->{token})) {
     print "id     : %s\n" , $rec->{identifier};
     print "date   : %s\n" , $rec->{date};
     print "start  : %s\n" , $rec->{start};
     print "length : %s\n" , $rec->{len};
 }

 my $rec = $x->get_identifier('oai:arXiv.org:hep-th:0208183');
 my $xml = $x->get_record('oai:arXiv.org:hep-th:0208183');

DESCRIPTION

Top

This modules creates an index on XMLtapes to enable fast retrieval of XML documents from the archive. The index files are stored next to the XMLtape.

METHODS

Top

$x = indexopen($tape_file, $flag)

This function opens an index for reading or writing. The parameter tape_file is the location of a XMLtape archive. The flag is "w" when creating a new index or "r" when reading an index. An XML::Tape::Index instance will be returned on success or undef on failure.

$x->reindex()

This method reads the XMLtape extracts all identifier and datestamps from it and stores the byte positions of all records in the index.

$x->list_identifiers([$token])
$x->list_identifiers($from,$until)

Use this method to iterate through the index to return all records. This method returns an index record on success or undef when no more records are available. Each index record is a HASH reference containing the fields 'identifier', 'date', 'start' (the starting byte of the XML document in the XMLtape), 'len' (the length of the XML document in the XMLtape) and 'token'. The 'token' field should be used to return the next index record. One can filter the returned indexed records by using two arguments at the first list_identifiers method invocation. Only index records with dates greater or equal than 'from' and less than 'until' will be returned by subsequent list_identifier requests. E.g.

 # Return all index records...
 for (my $r = $x->list_identifiers(); 
      defined($r);
      $r = $x->list_identifiers($r->{token}) {
 }

 # Return all index records with dates between 2000-01-01 and 2005-12-31...
 for (my $r = $x->list_identifiers(
             '2001-01-01T00:00:00Z',
             '2005-12-31T23:59:59Z'
                    );
      defined($r);
      $r = $x->list_identifiers($r->{token}) {
 }

$x->get_earlist_date()

This methods returns earliest date in the index file

$x->get_tape_file()

This methods returns name of the tape file associated with this index.

$x->get_num_of_records()

This methods returns the number of record in an index.

$x->get_identifier($identifier)

This method returns an index record given an identifier as argument. When no matching index record can be found undef will be returned. The index record is a HASH reference containing the fields 'identifier', 'date', 'start' and 'len' (see above).

$x->get_record($identifier)

This method returns an XML document from the XMLtape given an identifier as argument. When no matching record can be found undef will be returned.

$x->indexclose();

Closes the XMLtape index.

indexexists($tape_file)

This class method returns true when an index on the XMLtape with location $tape_file exists, returns false otherwise.

indexdrop($tape_file)

This class method deletes the index associated with the XMLtape with location $tape_file.

BUGS

Top

 The XML::Tape::Index doesn't lock XMLtape before writing. It is possible to
 overwrite and index while another process is reading it.

CREDITS

Top

XMLtape archives were developed by the Digital Library Research & Prototyping team at Los Alamos National Laboratory.

SEE ALSO

Top

XML::Tape

AUTHOR

Top

Patrick Hochstenbach <Patrick.Hochstenbach@UGent.be>


XML-Tape documentation Contained in the XML-Tape distribution.
#
# $Id: Index.pm,v 1.5 2005/09/01 08:19:27 patrick Exp $
#

package XML::Tape::Index;
use strict;
use DB_File;
use XML::Tape;
use Digest::MD5 qw(md5);
require Exporter;
use vars qw($VERSION);

( $VERSION ) = '$Revision: 1.5 $ ' =~ /\$Revision:\s+([^\s]+)/;;

@XML::Tape::Index::ISA = qw(Exporter);
@XML::Tape::Index::EXPORT_OK = qw(indexopen indexexists indexdrop);
%XML::Tape::Index::EXPORT_TAGS = (all => [qw(indexopen indexexists indexdrop)]);
$XML::Tape::Index::VERBOSE = 0;
$XML::Tape::Index::CACHE_SIZE = 4 * 1024 * 1024;

sub _get_index {
    my ($filename) = @_;
    return {
        adm_index_file => "$filename.adm" ,
        rec_index_file => "$filename.rec" ,
        dat_index_file => "$filename.dat" ,
    }
}

sub indexopen {
    my ($tape_file, $flag, $mode) = @_;
    my (%admh,$admh);
    my (%rech,$rech);
    my (%idsh,$idsh);
    my (%dath,$dath);
    $mode = 0644 unless $mode;

    my $files = &_get_index($tape_file);

    my $this = bless {} , 'XML::Tape::Index';
    $this->{mode}      = $flag;

    if ($flag eq 'w') {
        $flag = O_CREAT | O_RDWR;
    }
    elsif ($flag eq 'r') {
        $flag = O_RDONLY;
    }
    else {
        die "usage: indexopen(\$tape_file, 'r' | 'w')";
    }

    my $f_hash = new DB_File::HASHINFO;
    $f_hash->{cachesize} = $XML::Tape::Index::CACHE_SIZE;
    my $f_btree = new DB_File::BTREEINFO;
    $f_btree->{cachesize} = $XML::Tape::Index::CACHE_SIZE;
    $f_btree->{flags} = R_DUP;

    $admh = tie %admh, 'DB_File' ,  $files->{adm_index_file} , $flag, $mode, $f_hash
                   || die "can't tie " . $files->{adm_index_file} . ": $!";
    $rech = tie %rech, 'DB_File' ,  $files->{rec_index_file} , $flag, $mode, $f_hash
                   || die "can't tie " . $files->{rec_index_file} . ": $!";
    $dath = tie %dath, 'DB_File' ,  $files->{dat_index_file} , $flag, $mode, $f_btree
                   || die "can't tie " . $files->{dat_index_file} . ": $!";

    $this->{tape_file} = $tape_file;
    $this->{admh}      = $admh;
    $this->{rech}      = $rech;
    $this->{dath}      = $dath;
    $this->{t_admh}    = \%admh;
    $this->{t_rech}    = \%rech;
    $this->{t_dath}    = \%dath;
    return $this;
}

sub reindex {
    my ($this) = @_;

    die "reindex: only allowed in 'w' mode" unless ($this->{mode} eq 'w');

    my $num_of_rec = 0;
    my $tape = XML::Tape::tapeopen($this->{tape_file}, 'r') || return undef;

    my $_start = time();
    my $earliest_datestamp = undef;
    while (my $record = $tape->get_record()) {
        $num_of_rec++;
        my $id     = $record->getIdentifier();
        my $date   = $record->getDate();
        my $start  = $record->getStartByte();
        my $length = $record->getEndByte() - $start;
        my $value  = join("\t", $id, $date, $start, $length);
        my $key    = md5($id);
        $this->{rech}->put($key,$value);
        $this->{dath}->put($date,$key);

        if ($XML::Tape::Index::VERBOSE && $num_of_rec % 10000 == 0) {
            my $speed = int($num_of_rec/(time - $_start + 1));
            print "record: $num_of_rec ($speed r/s) read: " . $record->getEndByte() . " bytes\n";
        }

        my $comp_date = $date; $comp_date =~ s/\D+//g;
        if ( ! defined $earliest_datestamp || $earliest_datestamp->{val} > $comp_date ) {
            $earliest_datestamp->{val} = $comp_date;
            $earliest_datestamp->{str} = $date;
        }
    }
    $tape->tapeclose();

    $this->{admh}->put('tapefile', $this->{tape_file});
    $this->{admh}->put('recnum', $num_of_rec);
    $this->{admh}->put('earliest', $earliest_datestamp->{str});

    return $num_of_rec;
}

sub list_identifiers {
    my ($this) = shift;
    my ($from,$until,$md5);
  
    die "list_identifiers: only allowed in 'r' mode" unless ($this->{mode} eq 'r');

    # If we have two arguments we need to filter on 'from' and 'until' date...
    if (@_ == 2) {
        ($from,$until) = @_;
        $this->{'from'}  = $from;
        $this->{'until'} = $until;
    }
    # If we have one argument than it is a resumption token... 
    elsif (@_ == 1) {
        ($from,$md5) = split(/,/,shift,2);
        $md5 = pack("H*",$md5);
        $until = $this->{'until'};
    }
    # Else, we need to return all entries..
    else {
        $from = $until = undef;
        $this->{'from'}  = $from;
        $this->{'until'} = $until;
    }

    my $status;

    if ($md5) {
        $status = $this->{dath}->find_dup($from, $md5);
        $status = $this->{dath}->seq($from, $md5, R_NEXT) if ($status == 0);
    }
    elsif ($from) {
        $status = $this->{dath}->seq($from, $md5, R_CURSOR);
    }
    else {
        $status = $this->{dath}->seq($from, $md5, R_FIRST);
    }

    return undef unless ($status == 0);
    return undef if (defined $until && ($from cmp $until) >= 0);

    my $values;
    $status = $this->{rech}->get($md5,$values);

    return undef unless ($status == 0);

    my (@field) = split(/\t/,$values);
    return {
        'identifier'   => $field[0] ,
        'date'         => $field[1] ,
        'start'        => $field[2] ,
        'length'       => $field[3] ,
        'token'        => $field[1] . "," . unpack("H*",$md5)
    };
}

sub get_earliest_date {
    my ($this, $id) = @_;
    my $values;
    $this->{admh}->get('earliest',$values);
    return $values;
}

sub get_tape_file {
    my ($this, $id) = @_;
    my $values;
    $this->{admh}->get('tapefile',$values);
    return $values;
}

sub get_num_of_records {
    my ($this, $id) = @_;
    my $values;
    $this->{admh}->get('recnum',$values);
    return $values;
}

sub get_identifier {
    my ($this, $id) = @_;
    my $md5 = md5($id);
    my $values;

    die "get_identifier: only allowed in 'r' mode" unless ($this->{mode} eq 'r');

    $this->{rech}->get($md5,$values);

    return undef unless $values;

    my (@field) = split(/\t/,$values);
    return {
        'identifier'   => $field[0] ,
        'date'         => $field[1] ,
        'start'        => $field[2] ,
        'len'          => $field[3] ,
    };
}

sub get_record {
    my ($this, $id) = @_;

    die "get_record: only allowed in 'r' mode" unless ($this->{mode} eq 'r');

    local(*F);
    my $rec = $this->get_identifier($id);
   
    return undef unless $rec;

    my $xml;
    if ($rec->{start} && $rec->{len}) {
        open(F, $this->{tape_file}) || return undef;
        seek(F, $rec->{start}, 0);
        read(F, $xml, $rec->{len});
        close(F);
    }
    return $xml; 
}

sub indexclose {
    my ($this) = @_;

    $this->{admh} = undef;
    $this->{rech} = undef;
    $this->{dath} = undef;
    untie %{$this->{t_admh}};
    untie %{$this->{t_rech}};
    untie %{$this->{t_dath}};
}

sub indexexists {
    my ($filename) = @_;
    
    my $files = &_get_index($filename);

    return (-e $files->{adm_index_file} && -e $files->{rec_index_file} && -e $files->{dat_index_file});
}

sub indexdrop {
    my ($filename) = @_;

    my $files = &_get_index($filename);

    unlink $files->{adm_index_file};
    unlink $files->{rec_index_file};
    unlink $files->{dat_index_file};
}

1;