CPAN::Index::Loader - Populates the CPAN index SQLite database


CPAN-Index documentation Contained in the CPAN-Index distribution.

Index


Code Index:

NAME

Top

CPAN::Index::Loader - Populates the CPAN index SQLite database

DESCRIPTION

Top

This package implements all the functionality required to download the CPAN index data, parse it, and populate the SQLite database file.

Because it involves loading a number of otherwise unneeded modules, this package is not loaded by default with the rest of CPAN::Index, but may be loaded on-demand if needed.

METHODS

Top

new

  my $loader = CPAN::Index::Loader->new(
      remote_uri => 'http://search.cpan.org/CPAN',
      local_dir  => '/tmp/cpanindex',
      );

cache

The cache accessor returns a CPAN::Cache object that represents the CPAN cache.

remote_uri

The remote_uri accessor return a URI object for the location of the CPAN mirror.

local_dir

The local_dir accessor returns the filesystem path for the root directory of the local CPAN file cache.

local_file

  my $path = $loader->local_file('01mailrc.txt.gz');

The local_file method takes the name of a file in the CPAN and returns the local path to the file.

Returns a path string, or throws an exception on error.

local_handle

  my $path = $loader->local_handle('01mailrc.txt.gz');

The local_handle method takes the name of a file in the CPAN and returns an IO::Handle to the file.

Returns an IO::Handle, most likely an IO::Handle, or throws an exception on error.

load_index

The load_index takes a single param of the schema to load, locates the three main index files based on the local_dir path, and then loads the index from those files.

Returns the total number of records added.

load_authors

  CPAN::Index::Loader->load_authors( $schema, $handle );

The load_authors method populates the package table from the CPAN 01mailrc.txt.gz file.

The author table in the SQLite database should already be empty before this method is called.

Returns the number of authors added to the database, or throws an exception on error.

load_packages

  CPAN::Index::Loader->load_packages( $schema, $handle );

The load_packages method populates the package table from the CPAN 02packages.details.txt.gz file.

The package table in the SQLite database should already be empty before this method is called.

Returns the number of packages added to the database, or throws an exception on error.

SUPPORT

Top

Bugs should be reported via the CPAN bug tracker

http://rt.cpan.org/NoAuth/ReportBug.html?Queue=CPAN-Index

For other issues, contact the author.

AUTHOR

Top

Adam Kennedy <cpan@ali.as>

Parts based on various modules by Leon Brocard <acme@cpan.org>

SEE ALSO

Top

Related: CPAN::Index, CPAN

Based on: Parse::CPAN::Authors, Parse::CPAN::Packages

COPYRIGHT

Top


CPAN-Index documentation Contained in the CPAN-Index distribution.
package CPAN::Index::Loader;

use strict;
use Carp           ();
use IO::File       ();
use IO::Zlib       ();
use Params::Util   qw{ _INSTANCE _HANDLE };
use Email::Address ();
use CPAN::Cache    ();

use vars qw{$VERSION};
BEGIN {
	$VERSION = '0.01';
}





#####################################################################
# Constructor and Accessors

sub new {
	my $class = shift;
	my $self  = bless { @_ }, $class;

	# Create the cache object
	unless ( $self->cache ) {
		my @params = ();
		$self->{cache} = CPAN::Cache->new(
			remote_uri => delete($self->{remote_uri}),
			local_dir  => delete($self->{local_dir}),
			trace      => $self->{trace},
			verbose    => $self->{verbose},
			);
	}

	$self;
}

sub cache {
	$_[0]->{cache};
}

sub remote_uri {
	$_[0]->cache->remote_uri;
}

sub local_dir {
	$_[0]->cache->local_dir;
}

sub local_file {
	$_[0]->cache->file($_[1])->path;
}

sub local_handle {
	my $self = shift;
	my $file = $self->local_file(shift);
	$file =~ /\.gz$/
		? IO::Zlib->new( $file, 'rb' ) # [r]ead [b]inary file
		: IO::File->new( $file );
}





#####################################################################
# Main Methods

sub load_index {
	my $self    = shift;
	my $schema  = shift;
	my $created = 0;

	# Load the files
	$created += $self->load_authors(
		$schema,
		$self->local_handle('authors/01mailrc.txt') ||
		$self->local_handle('authors/01mailrc.txt.gz'),
		);
	$created += $self->load_packages(
		$schema,
		$self->local_handle('modules/02packages.details.txt') ||
		$self->local_handle('modules/02packages.details.txt.gz'),
		);

	# Return the total
	$created;
}





#####################################################################
# Parsing Methods

sub load_authors {
	my $self   = shift;
	my $schema = _INSTANCE(shift, 'DBIx::Class::Schema')
		or Carp::croak("Did not provide a DBIx::Class::Schema param");
	my $handle = _HANDLE(shift)
		or Carp::croak("Did not provide a file handle param");

	# Wrap the actual method in a DBIx::Class transaction
	my $created = 0;
	
	my $rs = eval {
		$schema->txn_do( sub {
			$created = $self->_load_authors( $schema, $handle );
		} );
	};
	if ( $@ =~ /Rollback failed/ ) {
		Carp::croak("Rollback failed, database may be corrupt");
	} elsif ( $@ ) {
		Carp::croak("Database error while loading authors: $@");
	}

	$created;	
}
	
sub _load_authors {
	my ($self, $schema, $handle) = @_;

	# Every email address should be different, so disable
	# Email::Address caching so we don't waste a bunch of memory.
	local $Email::Address::NOCACHE = 1;

	# Process the author records
	my $created = 0;
	while ( my $line = $handle->getline ) {
		# Parse the line
		unless ( $line =~ /^alias\s+(\S+)\s+\"(.+)\"[\012\015]+$/ ) {
			Carp::croak("Invalid 01mailrc.txt.gz line '$line'");
		}
		my $id    = $1;
		my $email = $2;

		# Parse the full email address to seperate the parts
		my @found = Email::Address->parse($email);
		unless ( @found ) {
			# Invalid email or something that Email::Address can't handle.
			# Use a default name and address for now.
			@found = Email::Address->parse( "$id <$id\@cpan.org>" );
		}

		# Some CPAN users have multiple addresses, for example
		# A. PREM ANAND <prem_and@rediffmail.com,prem@ncbs.res.in>
		# When this happens, we'll just take the first one.

		# Create the record
		$schema->resultset('Author')->create( {
			id    => $id,
			name  => $found[0]->name,
			email => $found[0]->address,
			} );
		$created++;

		# Debugging
		#if ( $Test::More::VERSION ) {
		#	Test::More::diag("$created...");
		#}
	}

	$created;
}

sub load_packages {
	my $self   = shift;
	my $schema = _INSTANCE(shift, 'DBIx::Class::Schema')
		or Carp::croak("Did not provide a DBIx::Class::Schema param");
	my $handle = _HANDLE(shift)
		or Carp::croak("Did not provide a file handle param");

	# Advance past the header, to the first blank line
	while ( my $line = $handle->getline ) {
		last if $line !~ /[^\s\012\015]/;
	}

	# Wrap the database method in a DBIx::Class transaction
	my $created;
	my $rs = eval {
		$schema->txn_do( sub {
			$created = $self->_load_packages( $schema, $handle );
		} );
	};
	if ( $@ =~ /Rollback failed/ ) {
		Carp::croak("Rollback failed, database may be corrupt");
	} elsif ( $@ ) {
		Carp::croak("Database error while loading packages: $@");
	}

	$created;
}

sub _load_packages {
	my ($self, $schema, $handle) = @_;

	# Process the author records
	my $created = 0;
	while ( my $line = $handle->getline ) {
		unless ( $line =~ /^(\S+)\s+(\S+)\s+(.+?)[\012\015]+$/ ) {
			Carp::croak("Invalid 02packages.details.txt.gz line '$line'");
		}
		my $name    = $1;
		my $version = $2 eq 'undef' ? undef : $2;
		my $path    = $3;

		# Create the record
		$schema->resultset('Package')->create( {
			name    => $name,
			version => $version,
			path    => $path,
			} );
		$created++;

		# Debugging
		#if ( $Test::More::VERSION ) {
		#	Test::More::diag("$created...");
		#}
	}

	$created;
}

1;