String::Tokeniser - Perl extension for, uhm, tokenising strings.


String-Tokeniser documentation Contained in the String-Tokeniser distribution.

Index


Code Index:

NAME

Top

String::Tokeniser - Perl extension for, uhm, tokenising strings.

SYNOPSIS

Top

  use String::Tokeniser;

DESCRIPTION

Top

String::Tokeniser provides an interface to a tokeniser class, allowing one to manipulate strings on a token-by-token basis without having to keep track of list element numbers and so on.

CONSTRUCTOR

Top

new ( $sentence, [0|-1|$regexp], [$exception...] )

Creates a String::Tokeniser object, tokenises $sentence and resets the token counter.

The next argument determines how a ``token'' is defined: a value of 0 or undef determines that underscores are included in a token; -1 states that they are not. Alternatively, you can supply your own regular expression which will be fed to a split to determine the tokens.

Then may optionally follow a list of exceptions: tokens that would be split in two, but should be treated as one.

METHODS

Top

moretokens

Tells you if you have any more tokens left to deal with.

skiptoken([n])

Move the `pointer' forward one (or n) tokens.

thistoken

Return the current token; that is, the token under the `pointer'.

lasttoken

Return the previous token; that is, the one immediately before the `pointer'.

gettoken

Equivalent to skiptoken; lasttoken - the usual way of grabbing the next token in the list in turn.

nexttoken

Looks ahead one token, but does not change the `pointer' position.

lookahead([n])

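Returns a string composed of n tokens, starting with the token under the `pointer', but does not change the `pointer' position. n must be at least 2; a smaller or missing value makes the call croak.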

gimme($string)

Assuming a string of tokens will end in $string, returns everything from the current `pointer' position until the string is found. Returns a two-element list: firstly, why the search terminated (either EOF, meaning we hit the end of the token list without success, or FOUND, meaning $string was found), and secondly the tokens up to and including $string (or up to the end of the list, whichever comes first).

save

Saves one's pointer position. Can be used multiply as a save stack.

restore

Restores a previously saved position.

FEATURES

Top

At present, there is no support for exceptions which spread over three or more tokens, although this is planned.

AUTHOR

Top

Originally written by Simon Cozens; maintained by Alberto Simões <ambs@cpan.org>

SEE ALSO

Top

WEBPerl::Changetie


String-Tokeniser documentation Contained in the String-Tokeniser distribution.
package String::Tokeniser;

# Tokeniser class: the constructor splits a string into a list of tokens and
# the methods below walk that list with a movable position counter.

use strict;
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK);
use Carp;    # carp/croak, used by new() and lookahead()

require Exporter;

@ISA = qw(Exporter);
# The export list is empty: in the code shown here the module is driven
# entirely through method calls, so nothing needs to reach the caller's
# namespace.
@EXPORT_OK = qw();
$VERSION = '0.04';


# True value handed back to require/use. The named subs that follow are
# compiled rather than executed when the file is loaded, so this remains
# the last statement actually evaluated.
1;

# Constructor: new($sentence, [0|-1|$regexp], [$exception...])
#
# Splits $sentence into tokens and returns a blessed object holding:
#   STYLE - the pattern the sentence was split on
#   LIST  - the token list (after exception handling)
#   COUNT - the current position, starting at 0
#   STACK - the stack of saved positions, starting empty
#
# The second argument selects the split pattern:
#   0 / undef / omitted - split around every character outside [a-zA-Z0-9_]
#   -1                  - as above, but "_" is treated as a separator too
#   anything else       - used verbatim as the pattern handed to split
#
# Any remaining arguments are exceptions and are passed on to _except().
# Carps (and tokenises nothing) when $sentence is undefined.
sub new {
    my $classname = shift;
    my $sentence  = shift;
    my $style     = shift || 0;

    my $self = bless {}, $classname;

    unless (defined $sentence) {
        carp "! Nothing to tokenise";
        $sentence = '';    # split below then yields an empty token list
    }

    # The -1 sentinel is compared as a string: a numeric == would numify a
    # caller-supplied pattern, so something like "-1|x" would be mistaken
    # for the sentinel and other patterns would trigger numeric warnings.
    if (!ref($style) && $style eq '-1') {
        $style = "(?<=[^a-zA-Z0-9])|(?=[^a-zA-Z0-9])";
    }
    elsif (!$style) {
        $style = "(?<=[^a-zA-Z0-9_])|(?=[^a-zA-Z0-9_])";
    }
    # Otherwise $style is the caller's own pattern and is left untouched.

    $self->{STYLE} = $style;
    $self->{LIST}  = [ split /$style/, $sentence ];
    $self->{COUNT} = 0;
    $self->{STACK} = [];

    # Glue back together any token pairs the caller listed as exceptions.
    $self->_except(@_);

    return $self;
}

# Report whether the position counter still lies within the token list.
# Returns a true value while COUNT is no greater than the last index of LIST.
sub moretokens {
    my ($self) = @_;
    my $last_index = $#{ $self->{LIST} };
    return ($self->{COUNT} <= $last_index);
}

# Advance the position counter by $howmany tokens (one when omitted).
# Returns the new value of COUNT.
sub skiptoken {
    my ($self, $howmany) = @_;
    $howmany = 1 if !defined $howmany;
    return $self->{COUNT} += $howmany;
}

# Return the token at the current position (LIST[COUNT]) without moving.
sub thistoken {
    my ($self) = @_;
    my $position = $self->{COUNT};
    return $self->{LIST}[$position];
}

# Return the token immediately before the current position (LIST[COUNT-1]),
# or undef when the position is at (or before) the start of the list and so
# has no previous token.
sub lasttoken {
    my ($self) = @_;
    my $position = $self->{COUNT};

    # A plain LIST[COUNT-1] would index with -1 at position 0, and Perl
    # resolves a negative index from the END of the array, handing back the
    # final token instead of "nothing".
    return $position > 0 ? $self->{LIST}->[ $position - 1 ] : undef;
}

# Step the position forward by one token and return the token that was
# just stepped over.
sub gettoken {
    my ($self) = @_;

    $self->skiptoken();
    return $self->lasttoken();
}

# Peek at the token one place after the current position (LIST[COUNT+1])
# without moving the position counter.
sub nexttoken {
    my ($self) = @_;
    my $ahead = $self->{COUNT} + 1;
    return $self->{LIST}[$ahead];
}

# Return the concatenation of $howmany tokens starting at the current
# position, without moving the position counter. Fewer tokens are returned
# when the list ends first.
#
# Croaks with "Silly value in lookahead" unless $howmany is greater than 1
# (a missing $howmany is rejected the same way).
sub lookahead {
    my ($self, $howmany) = @_;

    # Test definedness first so a missing argument is rejected without a
    # numeric comparison against undef.
    croak "Silly value in lookahead" if !defined $howmany || $howmany <= 1;

    my $list  = $self->{LIST};
    my $start = $self->{COUNT};
    my $ret   = "";
    for (my $i = $start; $i < $start + $howmany; $i++) {
        # Nothing lies beyond the last token; stop rather than append undef.
        last if $i > $#{$list};
        my $token = $list->[$i];
        $ret .= $token if defined $token;
    }
    return $ret;
}

# Consume tokens from the current position until the text collected so far
# ends with $expectation, or the token list runs out.
#
# Returns a two-element list: "FOUND" or "EOF", followed by the
# concatenation of every token consumed by this call.
sub gimme {
    my ($self, $expectation) = @_;
    my $collected = "";

    while ($self->moretokens()) {
        $collected .= $self->gettoken();

        # Compare only the tail of what has been gathered, so the
        # expectation may span several tokens.
        my $tail = substr($collected, -length($expectation));
        return ("FOUND", $collected) if $tail eq $expectation;
    }

    return ("EOF", $collected);
}

# Push the current position onto the save stack so restore() can return to
# it later. Returns the new depth of the stack.
sub save {
    my ($self) = @_;
    my $position = $self->{COUNT};
    return push @{ $self->{STACK} }, $position;
}

# Pop the most recently saved position off the save stack and make it the
# current position. Does nothing when the stack is empty.
#
# Returns the restored position, or nothing when there was none to restore.
sub restore {
    my ($self) = @_;
    my $saved = pop @{ $self->{STACK} };

    # Test for definedness, not truth: position 0 (saved at the very start
    # of the token list) is a perfectly valid place to go back to.
    return unless defined $saved;

    $self->{COUNT} = $saved;
    return $saved;
}

# I have no idea how this works any more. And I've *only just* written
# it.
#                 -- Simon Cozens
#
# But it is correct, and simple! You just need to indent it correctly.
#
#                 -- Alberto Simoes

# Apply the caller-supplied exceptions to the freshly split token list.
#
# Each exception is a string that the tokeniser pattern splits in two (for
# example "a-" under the default pattern). Wherever those two halves sit next
# to each other in the token list they are glued back into a single token.
# Only the first two pieces of an exception are used, so exceptions spanning
# three or more tokens are not supported.
#
# Arguments: the exception strings.
# Replaces $self->{LIST} with the merged list and returns $self.
sub _except {
    my ($self, @exceptions) = @_;
    my $style = $self->{STYLE};

    # Map the first half of each exception to the second halves allowed to
    # follow it. A lexical loop variable is used on purpose: assigning to
    # the global $_ here would overwrite whatever the caller has aliased to
    # it, e.g. the current element of a foreach loop around the constructor.
    my %followers_of;
    for my $exception (@exceptions) {
        next unless defined $exception;
        my ($left, $right) = split /$style/, $exception;

        # An exception the pattern does not split leaves nothing to glue.
        next unless defined $left && defined $right;
        push @{ $followers_of{$left} }, $right;
    }

    my @tokens = @{ $self->{LIST} };
    my @merged;
    my $i = 0;
    while ($i <= $#tokens) {
        my $first  = $tokens[$i];
        my $second = $tokens[ $i + 1 ];    # undef when $first is the final token

        # Definedness (not truth) decides whether a second token exists, so
        # a "0" token is kept and no empty token is invented at the end.
        if (   defined $second
            && exists $followers_of{$first}
            && grep { $_ eq $second } @{ $followers_of{$first} })
        {
            push @merged, $first . $second;
            $i += 2;
        }
        else {
            # No match: keep $first and let $second get its own chance to
            # start an exception on the next pass.
            push @merged, $first;
            $i++;
        }
    }

    $self->{LIST} = \@merged;
    return $self;
}

# sub ishere { return 1 }