Hailo::Tokenizer::Words - A tokenizer for L<Hailo|Hailo> which splits


Hailo documentation Contained in the Hailo distribution.

Index


Code Index:

NAME

Top

Hailo::Tokenizer::Words - A tokenizer for Hailo which splits on whitespace and word boundaries, mostly.

DESCRIPTION

Top

This tokenizer does its best to handle various languages. It knows about most apostrophes, quotes, and sentence terminators.

AUTHOR

Top

Hinrik Örn Sigurðsson, hinrik.sig@gmail.com

LICENSE AND COPYRIGHT

Top


Hailo documentation Contained in the Hailo distribution.

package Hailo::Tokenizer::Words;
BEGIN {
  $Hailo::Tokenizer::Words::AUTHORITY = 'cpan:AVAR';
}
BEGIN {
  $Hailo::Tokenizer::Words::VERSION = '0.69';
}

use 5.010;
use utf8;
use Any::Moose;
use Any::Moose 'X::StrictConstructor';
use Regexp::Common qw/ URI /;
use namespace::clean -except => 'meta';

with qw(Hailo::Role::Arguments
        Hailo::Role::Tokenizer);

# [[:alpha:]] doesn't match combining characters on Perl >=5.12
my $ALPHABET   = qr/(?![_\d])\w/;

# tokenization
my $SPACE      = qr/\s/;
my $NONSPACE   = qr/\S/;
my $DASH       = qr/[–-]/;
my $POINT      = qr/[.,]/;
my $APOSTROPHE = qr/['’´]/;
my $ELLIPSIS   = qr/\.{2,}|…/;
my $NON_WORD   = qr/[^\w\s]+/;
my $BARE_WORD  = qr/\w+/;
my $CURRENCY   = qr/[¤¥¢£\$]/;
my $NUMBER     = qr/$CURRENCY?$POINT\d+(?:$POINT\d+)*(?:$CURRENCY|$ALPHABET+)?|$CURRENCY?\d+(?:$POINT\d+)*(?:$CURRENCY|$ALPHABET+)?(?!\d|$ALPHABET)/;
my $APOST_WORD = qr/$ALPHABET+(?:$APOSTROPHE$ALPHABET+)+/;
my $ABBREV     = qr/$ALPHABET(?:\.$ALPHABET)+\./;
my $DOTTED     = qr/$BARE_WORD?\.$BARE_WORD(?:\.$BARE_WORD)*/;
my $WORD_TYPES = qr/$NUMBER|$ABBREV|$DOTTED|$APOST_WORD|$BARE_WORD/;
my $WORD_APOST = qr/$WORD_TYPES(?:$DASH$WORD_TYPES)*$APOSTROPHE(?!$ALPHABET|$NUMBER)/;
my $WORD       = qr/$WORD_TYPES(?:(?:$DASH$WORD_TYPES)+|$DASH(?!$DASH))?/;
my $MIXED_CASE = qr/ \p{Lower}+ \p{Upper} /x;
my $UPPER_NONW = qr/^ (?:\p{Upper}+ \W+)(?<!I') (?: \p{Upper}* \p{Lower} ) /x;

# special tokens
my $TWAT_NAME  = qr/ \@ [A-Za-z0-9_]+ /x;
my $EMAIL      = qr/ [A-Z0-9._%+-]+ @ [A-Z0-9.-]+ \. [A-Z]{2,4} /xi;
my $PERL_CLASS = qr/ (?: :: \w+ (?: :: \w+ )* | \w+ (?: :: \w+ )+ ) (?: :: )? | \w+ :: /x;
my $EXTRA_URI  = qr{ (?: \w+ \+ ) ssh:// $NONSPACE+ }x;
my $ESC_SPACE  = qr/(?:\\ )+/;
my $NAME       = qr/(?:$BARE_WORD|$ESC_SPACE)+/;
my $FILENAME   = qr/ $NAME? \. $NAME (?: \. $NAME )* | $NAME/x;
my $UNIX_PATH  = qr{ / $FILENAME (?: / $FILENAME )* /? }x;
my $WIN_PATH   = qr{ $ALPHABET : \\ $FILENAME (?: \\ $FILENAME )* \\?}x;
my $PATH       = qr/$UNIX_PATH|$WIN_PATH/;
my $DATE       = qr/[0-9]{4}-W?[0-9]{1,2}-[0-9]{1,2}/i;
my $TIME       = qr/[0-9]{1,2}:[0-9]{2}(?::[0-9]{2})?(?:Z| ?[AP]M|[-+±][0-9]{2}(?::?[0-9]{2})?)?/i;
my $DATETIME   = qr/${DATE}T$TIME/;
my $IRC_NICK   = qr/<[ @%+~&]?[A-Za-z_`\-^\|\\\{}\[\]][A-Za-z_0-9`\-^\|\\\{}\[\]]+>/;
my $IRC_CHAN   = qr/[#&+][^ \a\0\012\015,:]{1,199}/;
my $NUMERO     = qr/#[0-9]+/;
my $CLOSE_TAG  = qr{</[-\w]+>};

my $CASED_WORD = qr/$CLOSE_TAG|$IRC_NICK|$IRC_CHAN|$DATETIME|$DATE|$TIME|$PERL_CLASS|$EXTRA_URI|$EMAIL|$TWAT_NAME|$PATH|$NUMERO/;

# capitalization
# The rest of the regexes are pretty hairy. The goal here is to catch the
# most common cases where a word should be capitalized. We try hard to
# guard against capitalizing things which don't look like proper words.
# Examples include URLs and code snippets.
my $OPEN_QUOTE  = qr/['"‘“„«»「『‹‚]/;
my $CLOSE_QUOTE = qr/['"’“”«»」』›‘]/;
my $TERMINATOR  = qr/(?:[?!‽]+|(?<!\.)\.)/;
my $ADDRESS     = qr/:/;
my $PUNCTUATION = qr/[?!‽,;.:]/;
my $BOUNDARY    = qr/$CLOSE_QUOTE?(?:\s*$TERMINATOR|$ADDRESS)\s+$OPEN_QUOTE?\s*/;
my $LOOSE_WORD  = qr/$IRC_CHAN|$DATETIME|$DATE|$TIME|$PATH|$NUMBER|$ABBREV|$APOST_WORD|$NUMERO|$BARE_WORD(?:$DASH(?:$WORD_TYPES|$BARE_WORD)|$APOSTROPHE(?!$ALPHABET|$NUMBER|$APOSTROPHE)|$DASH(?!$DASH{2}))*/;
my $SPLIT_WORD  = qr{$LOOSE_WORD(?:/$LOOSE_WORD)?(?=$PUNCTUATION(?:\s+|$)|$CLOSE_QUOTE|$TERMINATOR|\s+|$)};

# we want to capitalize words that come after "On example.com?"
# or "You mean 3.2?", but not "Yes, e.g."
my $DOTTED_STRICT = qr/$LOOSE_WORD(?:$POINT(?:\d+|\w{2,}))?/;
my $WORD_STRICT   = qr/$DOTTED_STRICT(?:$APOSTROPHE$DOTTED_STRICT)*/;

# input -> tokens
sub make_tokens {
    my ($self, $input) = @_;

    my @tokens;
    $input =~ s/$DASH\K\s*\n+\s*//;
    $input =~ s/\s*\n+\s*/ /gm;

    while (length $input) {
        # remove the next chunk of whitespace
        $input =~ s/^$SPACE+//;
        my $got_word;

        while (length $input && $input =~ /^$NONSPACE/) {
            # We convert it to ASCII and then look for a URI because $RE{URI}
            # from Regexp::Common doesn't support non-ASCII domain names
            my ($ascii) = $input =~ /^($NONSPACE+)/;
            $ascii =~ s/[^[:ascii:]]/a/g;

            # URIs
            if (!$got_word && $ascii =~ / ^ $RE{URI} /xo) {
                my $uri_end = $+[0];
                my $uri = substr $input, 0, $uri_end;
                $input =~ s/^\Q$uri//;

                push @tokens, [$self->{_spacing_normal}, $uri];
                $got_word = 1;
            }
            # special words for which we preserve case
            elsif (!$got_word && $input =~ s/ ^ (?<word> $CASED_WORD )//xo) {
                push @tokens, [$self->{_spacing_normal}, $+{word}];
                $got_word = 1;
            }
            # normal words
            elsif ($input =~ / ^ $WORD /xo) {
                my $word;

                # special case to allow matching q{ridin'} as one word, even when
                # it appears as q{"ridin'"}, but not as q{'ridin'}
                my $last_char = @tokens ? substr $tokens[-1][1], -1, 1 : '';
                if (!@tokens && $input =~ s/ ^ (?<word>$WORD_APOST) //xo
                    || $last_char =~ / ^ $APOSTROPHE $ /xo
                    && $input =~ s/ ^ (?<word>$WORD_APOST) (?<! $last_char ) //xo) {
                    $word = $+{word};
                }
                else {
                    $input =~ s/^($WORD)//o and $word = $1;
                }

                # Maybe preserve the casing of this word
                $word = lc $word
                    if $word ne uc $word
                    # Mixed-case words like "WoW"
                    and $word !~ $MIXED_CASE
                    # Words that are upper case followed by a non-word character.
                    and $word !~ $UPPER_NONW;

                push @tokens, [$self->{_spacing_normal}, $word];
                $got_word = 1;
            }
            # everything else
            elsif ($input =~ s/ ^ (?<non_word> $NON_WORD ) //xo) {
                my $non_word = $+{non_word};
                my $spacing = $self->{_spacing_normal};

                # was the previous token a word?
                if ($got_word) {
                    $spacing = $input =~ /^$NONSPACE/
                        ? $self->{_spacing_infix}
                        : $self->{_spacing_postfix};
                }
                # do we still have more tokens?
                elsif ($input =~ /^$NONSPACE/) {
                    $spacing = $self->{_spacing_prefix};
                }

                push @tokens, [$spacing, $non_word];
            }
        }
    }

    return \@tokens;
}

# tokens -> output
sub make_output {
    my ($self, $tokens) = @_;
    my $reply = '';

    for my $pos (0 .. $#{ $tokens }) {
        my ($spacing, $text) = @{ $tokens->[$pos] };
        $reply .= $text;

        # append whitespace if this is not a prefix token or infix token,
        # and this is not the last token, and the next token is not
        # a postfix/infix token
        if ($pos != $#{ $tokens }
            && $spacing != $self->{_spacing_prefix}
            && $spacing != $self->{_spacing_infix}
            && !($pos < $#{ $tokens }
                && ($tokens->[$pos+1][0] == $self->{_spacing_postfix}
                || $tokens->[$pos+1][0] == $self->{_spacing_infix})
                )
            ) {
            $reply .= ' ';
        }
    }

    # capitalize the first word
    $reply =~ s/^\s*$OPEN_QUOTE?\s*\K($SPLIT_WORD)(?=$ELLIPSIS|(?:(?:$CLOSE_QUOTE|$TERMINATOR|$ADDRESS|$PUNCTUATION+)?(?:\s|$)))/\u$1/o;

    # capitalize the second word
    $reply =~ s/^\s*$OPEN_QUOTE?\s*$SPLIT_WORD(?:(?:\s*$TERMINATOR|$ADDRESS)\s+)\K($SPLIT_WORD)/\u$1/o;

    # capitalize all other words after word boundaries
    # we do it in two passes because we need to match two words at a time
    $reply =~ s/(?:$ELLIPSIS|\s+)$OPEN_QUOTE?\s*$WORD_STRICT$BOUNDARY\K($SPLIT_WORD)/\x1B\u$1\x1B/go;
    $reply =~ s/\x1B$WORD_STRICT\x1B$BOUNDARY\K($SPLIT_WORD)/\u$1/go;
    $reply =~ s/\x1B//go;

    # end paragraphs with a period when it makes sense
    $reply =~ s/(?:$ELLIPSIS|\s+|^)$OPEN_QUOTE?(?:$SPLIT_WORD(?:\.$SPLIT_WORD)*)\K($CLOSE_QUOTE?)$/.$1/o;

    # capitalize I'm, I've...
    $reply =~ s{(?:(?:$ELLIPSIS|\s+)|$OPEN_QUOTE)\Ki(?=$APOSTROPHE$ALPHABET)}{I}go;

    return $reply;
}

__PACKAGE__->meta->make_immutable;