AI::Categorizer::Collection::Files - One document per file


AI-Categorizer documentation Contained in the AI-Categorizer distribution.

Index


Code Index:

NAME

Top

AI::Categorizer::Collection::Files - One document per file

SYNOPSIS

Top

  my $c = AI::Categorizer::Collection::Files->new
    (path => '/tmp/docs/training',
     category_file => '/tmp/docs/cats.txt');
  print "Total number of docs: ", $c->count_documents, "\n";
  while (my $document = $c->next) {
    ...
  }
  $c->rewind; # For further operations

DESCRIPTION

Top

This implements a Collection class in which each document exists as a single file on a filesystem. The documents can exist in a single directory, or in several directories.

METHODS

Top

This is a subclass of the abstract AI::Categorizer::Collection class, so any methods mentioned in its documentation are available here.

new()

Creates a new Collection object and returns it. In addition to the parameters accepted by the superclass, the following parameters are accepted:

path

Indicates a location on disk where the documents can be found. The path may be specified as a string giving the name of a directory, or as a reference to an array of such strings if the documents are located in more than one directory.

recurse

Indicates whether subdirectories of the directory (or directories) in the path parameter should be descended into. If set to a true value, they will be descended into. If false, they will be ignored. The default is false.

AUTHOR

Top

Ken Williams, ken@mathforum.org

COPYRIGHT

Top

SEE ALSO

Top

AI::Categorizer::Collection(3)


AI-Categorizer documentation Contained in the AI-Categorizer distribution.

package AI::Categorizer::Collection::Files;
use strict;

use AI::Categorizer::Collection;
use base qw(AI::Categorizer::Collection);

use Params::Validate qw(:types);
use File::Spec;

# Declare the constructor parameters specific to this class.  valid_params
# is not defined in this file (presumably inherited through the base class);
# the type constants come from the Params::Validate import above.
__PACKAGE__->valid_params
  (
   # Either a single scalar or a reference to an array
   path => { type => SCALAR|ARRAYREF },
   # Boolean flag; 0 (off) when the caller does not supply it
   recurse => { type => BOOLEAN, default => 0 },
  );

# Constructor.  Builds the object through the superclass constructor, then
# sets up the directory-iteration state and opens the first directory.
#
# Iteration state kept on the object:
#   {dir_fh}  - glob used as the directory handle
#   {path}    - array ref of directories still waiting to be read
#   {used}    - array ref of directories already handed to the iterator
#
# Dies (via _next_path) if the first directory cannot be opened.
sub new {
  my $class = shift;
  my $self = $class->SUPER::new(@_);
  
  $self->{dir_fh} = do {local *FH; *FH};  # double *FH avoids a warning

  # Documents are contained in a directory, or list of directories.
  # Always work on a private array: _next_path shifts entries off {path}
  # and rewind pushes them back, which would otherwise empty and reorder
  # an array reference supplied by the caller.
  $self->{path} = ref $self->{path} ? [ @{$self->{path}} ] : [ $self->{path} ];
  $self->{used} = [];

  $self->_next_path;
  return $self;
}

# Advances the iterator to the next directory waiting in {path}.
#
# Closes the directory handle if a directory has been selected before,
# moves the first entry of {path} onto the end of {used}, records it as
# {cur_dir}, and opens it on {dir_fh}.
#
# Dies if {path} holds no directory, or if the directory cannot be opened
# (in which case the message is the directory name followed by the system
# error).
sub _next_path {
  my $self = shift;

  # Fail with a clear message before touching any state; otherwise an
  # undefined name would be recorded in {used} and passed to opendir.
  die "No directories left to read in collection path"
    unless @{ $self->{path} || [] };

  # Test definedness rather than truth: "0" is a legal directory name.
  closedir $self->{dir_fh} if defined $self->{cur_dir};

  $self->{cur_dir} = shift @{$self->{path}};
  push @{$self->{used}}, $self->{cur_dir};
  opendir $self->{dir_fh}, $self->{cur_dir} or die "$self->{cur_dir}: $!";
}

# Produces the next document of the collection.
#
# Takes the next file name from _read_file and returns nothing once no
# file is left.  Otherwise looks the name up in {category_hash}, warns when
# there is no entry for it, turns each listed category name into an object
# with AI::Categorizer::Category->by_name, and returns whatever
# call_method('document', 'read', ...) returns for that file.
sub next {
  my ($self) = @_;

  my $file = $self->_read_file;
  defined $file or return;

  my $cat_names = $self->{category_hash}{$file};
  warn "No category information about '$file'" unless defined $cat_names;

  # A file without category information simply gets an empty list
  my @cats;
  foreach my $cat_name (@{ $cat_names || [] }) {
    push @cats, AI::Categorizer::Category->by_name(name => $cat_name);
  }

  my $location = File::Spec->catfile($self->{cur_dir}, $file);
  return $self->call_method('document', 'read',
			    path => $location,
			    name => $file,
			    categories => \@cats,
			   );
}

# Returns the bare name (no directory part) of the next entry that is not
# a directory, moving on to the next queued directory whenever the current
# one runs out.  Returns undef once the current directory is exhausted and
# {path} is empty.
#
# Directory entries are never returned.  When {recurse} is true each one
# is appended to {path} for later processing, unless the same path string
# is already present in {path} or {used}.
sub _read_file {
  my $self = shift;

  ENTRY: while (1) {
    my $entry = readdir $self->{dir_fh};

    if (!defined $entry) {
      # Current directory is exhausted: stop if nothing is queued,
      # otherwise carry on in the next directory.
      return undef unless @{$self->{path}};
      $self->_next_path;
      next ENTRY;
    }

    # Skip the self and parent links
    next ENTRY if $entry eq '.' or $entry eq '..';

    my $full = File::Spec->catdir($self->{cur_dir}, $entry);
    return $entry unless -d $full;

    if ($self->{recurse}) {
      # Queue each subdirectory only once
      my $known = grep { $_ eq $full } @{$self->{path}}, @{$self->{used}};
      push @{$self->{path}}, $full unless $known;
    }
  }
}

# Restarts iteration: every directory recorded in {used} is appended to
# {path}, {used} is left empty, and the first directory now queued in
# {path} is opened through _next_path.
sub rewind {
  my ($self) = @_;

  # splice without an offset empties {used} and returns its old contents
  push @{$self->{path}}, splice @{$self->{used}};

  return $self->_next_path;
}

# Returns the number of file names _read_file yields over one full pass.
# The figure is computed once and cached in {document_count}.
#
# The first call rewinds the collection before and after counting, so it
# disturbs any iteration in progress; ideally this would share an iterator
# with next().
sub count_documents {
    my ($self) = @_;

    unless (defined $self->{document_count}) {
        $self->rewind;

        my $total = 0;
        while (defined $self->_read_file) {
            $total++;
        }

        $self->rewind;
        $self->{document_count} = $total;
    }

    return $self->{document_count};
}

1;
__END__