/usr/local/CPAN/DSlib/DS/Importer/TabStream.pm


#!perl

# ########################################################################## #
# Title:         Tabular stream to datastream importer
# Creation date: 2007-03-05
# Author:        Michael Zedeler
# Description:   Produces a datastream from a tab separated stream (IO::Handle)
#                Data Stream class
#                Data importer
# File:          $Source: /data/cvs/lib/DSlib/lib/DS/Importer/TabStream.pm,v $
# Repository:    kronhjorten
# State:         $State: Exp $
# Documentation: inline
# Recepient:     -
# TODO Must store field order internally.
# ########################################################################## #

package DS::Importer::TabStream;

use base qw{ DS::Importer };

use strict;
use Carp::Assert;

our ($VERSION) = $DS::VERSION;
our ($REVISION) = '$Revision: 1.1 $' =~ /(\d+\.\d+)/;

require DS::TypeSpec;

# Important: calling this method with no typespec will make
# the class read a header from the stream and use it
# to create a typespec.
sub new {
    my( $class, $fh, $target, $typespec, $field_order, $row ) = @_;

    assert($fh->isa('IO::Handle'));

    my @fields;
    if( not defined( $typespec ) ) {
        # Get header from stream to create type spec from

        assert( (not defined $field_order), 
            "Can't handle specifying field order if fields derived from stream header." );

        # Get first non-comment line in stream
        my $line = getline($fh);
        $line =~ s/[\n\r]+$//;
        $line =~ s/(\s+|$)/\n/g;

        my @fields = split /\s+/, $line;

        $typespec = new DS::TypeSpec( [ @fields ] );
        
        $field_order = [@fields];
    }

    my $self = $class->SUPER::new( $typespec, $target, $row );

    $self->{field_order} = $field_order;

    $self->{fh} = $fh;

    return $self;
}

sub _fetch {
    my($self) = @_;

    my $result = undef;

    unless($self->{fh}->eof()) {
        
        my ($line) = getline($self->{fh}) =~ /^([^\n\r]+)/;
        my (@line) = split /\t/, $line;

        # Replace values that are not defined (because the field was empty or not there)
        # with empty strings.
        # This is a design decision: we could check whether each line has exactly the right
        # number of fields. The problem is that Microsoft Excel truncates trailing empty fields
        # on each line, which would trigger spurious errors.
        for(my $i = 0; $i <= $#{$self->{field_order}}; $i++) {
            $line[$i] = '' unless defined( $line[$i] );
        }

        @{$self->{row}}{@{$self->{field_order}}} = @line;
        
        $result = 1;
    }

    return $result ? $self->{row} : undef;
}

# Functions (not methods!)

sub getline {
    my( $fh ) = @_;

    my $line;
    do {
         $line = $fh->getline();
    } while( $line =~ /^#/ and not $fh->eof());

    return $line;
}

1;