Text::Original - Find original, non-quoted text in a message


Text-Original documentation Contained in the Text-Original distribution.

Index


Code Index:

NAME

Top

Text::Original - Find original, non-quoted text in a message

SYNOPSIS

Top

    use Text::Original;
    my $sentence = first_sentence($email->body);

FUNCTIONS

Top

first_lines

    first_lines($text, 20);

Returns the a number of lines after the first non blank, none quoted line of the body of the email.

It will guess at attribution lines and skip them as well.

It will return super cited lines. This is the super-citers' fault, not ours.

It won't catch all types of attribution lines;

It can optionally be passed a number of lines to get.

first_paragraph

Returns the first original paragraph of the message

first_sentence

Returns the first original sentence of the message

EXPORTS

Top

All of the above.

AUTHOR

Top

Simon Wistow and the Mariachi project. See http://siesta.unixbeard.net/

Packaged by Simon Cozens

Currently maintained by Simon Wistow <simon@thegestalt.org>

COPYRIGHT

Top


Text-Original documentation Contained in the Text-Original distribution.
package Text::Original;
use 5.006; use strict; use warnings;
use Memoize;


our @ISA = qw(Exporter);
our %EXPORT_TAGS = ( 'all' => [ qw( first_lines first_paragraph first_sentence) ] ); 
our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
our @EXPORT = ( @{ $EXPORT_TAGS{'all'} } );

our $VERSION = '1.4';

sub first_lines {
    my $text = shift;
    my $num  = shift || 1;

    return _significant_signal($text, lines => $num);
}


sub first_paragraph {
    return _significant_signal(shift, para => 1);
}

sub first_sentence {
    my $text = first_paragraph(shift);
    $text =~ s/([.?!]) .*/$1/s; 
    return $text;
}

# Kudos to Damian Conway for this bit.
my $quotechar = qq{[!#%=|:]};
my $quotechunk = qq{(?:$quotechar(?![a-z])|[a-z]*>+)};
my $quoter = qq{(?:(?i)(?:$quotechunk(?:[ \\t]*$quotechunk)*))};

sub _significant_signal {
    my $text = shift;
    my %opts = @_;

    my $return = "";
    my $lines  = 0;

    # get all the lines from the main part of the body
    my @lines = split /$/m, $text;

    # right, find the start of the original content or quoted
    # content (i.e. skip past the attributation)
    my $not_started = 1;
    while (@lines && $not_started) {
        # next line
        local $_ = shift @lines;
        #print "}}$_";

        # blank lines, euurgh
        next if /^\s*$/;
        # quotes (we don't count quoted From's)
        next if /^\s*>(?!From)/;
        # Other kinds of quoter:
        next if /^\s*$quoter/;
        # skip obvious attribution
        next if /^\s*On (Mon|Tue|Wed|Thu|Fri|Sat|Sun)/i;
        next if /\d{4}-?\w{2,3}-?\d{2}.*\d+:\d+:\d+/i; # Looks like a date
        next if /^\w+(\s\w+)?:$/; # lathos' minimalist attributions. :)
        next if /^\s*.+=? wrote:/i;

        # skip signed messages
        next if /^\s*-----/;
        next if /^Hash:/;

        # annoying hi messages (this won't work with i18n)
        next if /^\s*(?:hello|hi|hey|greetings|salut
                                                |good (?:morning|afternoon|day|evening))
                                  (?:\W.{0,14})?\s*$/ixs;

        # snips
        next if m~\s*                          # whitespace
                                    [<.=-_*+({\[]*?              # opening bracket
                                    (?:snip|cut|delete|deleted)  # snip?
                                    [^>}\]]*?                    # some words?
                                    [>.=-_*+)}\]]*?              # closing bracket
                                  \s*$                          # end of the line
                                  ~xi;

        # [.. foo ..] or ...foo.. or so on
        next if m~\s*\[?\.\..*?\.\.]?\s*$~;

        # ... or [...]
        next if m~\s*\[?\.\.\.]?\s*$~;

        # if we got this far then we've probably got past the
        # attibutation lines
        unshift @lines, $_;  # undo the shift
        undef $not_started;  # and say we've started.
    }

    # okay, let's _try_ to build up some content then
    foreach (@lines) {
        # are we at the end of a paragraph?
        last if (defined $opts{'para'}  # paragraph mode?
                 && $opts{'para'}==1
                 && $lines>0            # got some lines aready?
                 && /^\s*$/);           # and now we've found a gap?

        # blank lines, euurgh
        next if /^\s*$/;
        # quotes (we don't count quoted From's)
        next if /^\s*>(?!From)/;

        # if we got this far then the line was a useful one
        $lines++;

        # sort of munged Froms
        s/^>From/From/;
        s/^\n+//;
        $return .= "\n" if $lines>1;
        $return .= $_;
        last if (defined $opts{'lines'} && $opts{'lines'}==$lines);
    }
    return $return;
}

memoize('_significant_signal');

1;