Solstice::StringLibrary - A library of generic string manipulation functions


Solstice documentation Contained in the Solstice distribution.

Index


Code Index:

NAME

Top

Solstice::StringLibrary - A library of generic string manipulation functions

SYNOPSIS

Top

  use StringLibrary qw(truncstr);

  my $str = truncstr("This is a line of text that needs truncating.");

DESCRIPTION

Top

Functions in this library make no assumptions about the content of the string being modified.

Superclass

Exporter

Export

No symbols exported.

Functions

htmltounicode($string)

Returns $string with all ê-like unicode entities packed into perl unicode.

scrubhtml ($string)

Returns $string with all malicious scripts, broken tags, relative links, dynamic css, etc removed. =cut

truncstr($string, $cutoff, $marker)

Returns $string truncated to $cutoff, and appended with an optional cutoff marker (defaults to '...').

truncemail($string, $left_limit, $right_limit, $marker)

Returns $string truncated to $left_limit characters to the left of the first @ sign, $right_limit characters to the right of the last @ sign. It will use $marker as the replacement. Defaults are 20, 30 and '...'.

fixstrlen($string, $cutoff, $marker)

Returns a string of fixed-length. Strings shorter than $cutoff are ignored. Strings longer than $cutoff are transformed as in the following example: Before: This is a long string of text that needs shortening After: This is a long string o...ning

fixlinewidth

Returns a string with breaking spaces inserted.

encode($string, $unsafe_chars)

Returns $string with HTML entities encoded. The string $unsafe_chars specifies which characters to consider unsafe (i.e., which to escape). The default set of characters to encode are control chars, high-bit chars, and the <, &, >, ' and " characters. This function just wraps HTML::Entities::encode_entities.

decode($string)

Returns $string with HTML entities decoded. This function just wraps HTML::Entities::decode.

unrender($string, $convert_whitespace)

Returns $string transformed into a non-HTML-renderable string, by converting '&<"' chars to entities. Numeric entities are ignored. If $convert_whitespace is passed and is true, whitespace chars ' ', \t and \n are converted to HTML approximations.

urlclean

Removes double slashes in urls

htmltotext($string)

$string should contain html. Returns $string with html removed, and replaced with whitespace formatting.

        <ul>
eg:     <li>a   becomes:    * a
        <li>b               * b
        </ul>
=cut




extracttext($string)

$string should contain html. Returns $string with html removed.

convertspaces($string)

Returns $string transformed into a non-breaking HTML line by replacing ' ' with '&nbsp;'.

strtoascii($string)

Changes certain characters (curly quotes, emdash, endash) to their ASCII equivalent.

\x91 curly single quote left \x92 curly single quote right \x93 curly double quote left \x94 curly double quote right \x95 bullet point \x96 emdash \x97 endash \xa9 copyright \x85 elipses • bullet point

strtourl($string)

Returns $string transformed into a safe url, by url-encoding non-word characters.

strtofilename($string, $preserve_whitespace)

Returns $string transformed into a safe file name, by converting spaces to underscores and removing forward slashes. $preserve_whitespace specifies that whitespace should be escaped rather than translated.

strtojavascript($string)

Returns $string transformed into a javascript-safe string, by escaping single- and double-quote characters.

trimstr($string)

Remove leading and trailing whitespace from $string.

scrubcdata($string)

This will return a string with ]]> escaped, so it will be cdata safe.

Modules Used

Exporter, HTML::Entities, HTML::TreeBuilder, HTML::FormatText, Solstice::StripScripts::Parser.

AUTHOR

Top

Catalyst Group, <catalyst@u.washington.edu>

VERSION

Top

$Revision: 2418 $

COPYRIGHT

Top


Solstice documentation Contained in the Solstice distribution.
package Solstice::StringLibrary;

# $Id: StringLibrary.pm 2418 2005-07-28 23:28:31Z mcrawfor $

use 5.006_000;
use strict;
use warnings;
use HTML::Entities;
use HTML::TreeBuilder;
use HTML::FormatText;
use Solstice::StripScripts::Parser;
use Exporter;

our @ISA = qw(Exporter);
our ($VERSION) = ('$Revision: 2418 $' =~ /^\$Revision:\s*([\d.]*)/);

our @EXPORT = qw|htmltounicode truncstr truncemail fixstrlen encode decode unrender scrubhtml convertspaces strtoascii strtourl strtofilename strtojavascript trimstr htmltotext extracttext scrubcdata urlclean fixlinewidth|;
our %EXPORT_TAGS = ( all => [ qw|
    htmltounicode
    truncstr
    truncemail
    fixstrlen
    encode
    decode
    unrender
    scrubhtml
    convertspaces
    strtoascii
    strtourl
    strtofilename
    strtojavascript
    trimstr
    htmltotext
    extracttext
    scrubcdata
    urlclean
    fixlinewidth
| ] );

our @EXPORT_OK = ( @{ $EXPORT_TAGS{all} } );

sub htmltounicode {
    my ($string) = @_;
    return undef unless defined $string;
    $string =~ s/&#(\d*?);/pack('U*', $1)/ge;
    return $string;
}


sub scrubhtml {
    my ($string) = @_;
    return undef unless defined $string;

    my $parser = Solstice::StripScripts::Parser->new({
        AllowSrc     => 1,
        AllowHref    => 1,
        AllowNonHTTP => 1,
    });
    $parser->parse($string);
    $parser->eof;
    return $parser->filtered_document;
}

sub truncstr {
    my ($string, $cutoff, $marker) = @_;
    return undef unless defined $string;
    $cutoff = 30 unless defined $cutoff;
    $marker = '...' unless defined $marker;
    
    return $string if (length($marker) > $cutoff);
    return $string if $cutoff < 0;

    if (length($string) > $cutoff) {
        $string = substr($string, 0, ($cutoff - length($marker))) ;
        $string .= $marker;
    }

    return $string;
}

sub truncemail {
    my $string = shift;
    return unless defined $string;
    my $left_limit = shift || 20;
    my $right_limit = shift || 30;
    my $marker = shift || '...';

    return $string if ($left_limit < 0 || $right_limit < 0);
    return $string if (length ($string) < $left_limit + $right_limit + length($marker));

    my $left_side = substr($string, 0, $left_limit);
    my $right_side = substr($string, -1*$right_limit);

    $left_side =~ /^([^@]{1,$left_limit})/;
    $left_side = $1;

    $right_side =~ /([^@]{1,$right_limit})$/;;
    $right_side = $1;

    return $left_side.$marker.'@'.$marker.$right_side;
}

sub fixstrlen {
    my ($string, $cutoff, $marker) = @_;
    return undef unless defined $string;
    $cutoff = 30 unless defined $cutoff;
    $marker = '...' unless defined $marker;

    return '' if $cutoff <= 0;

    #if the cutoff is too short to do something clean, just force it
    return substr($string, 0, $cutoff) if ((length($marker) + 4) > $cutoff);

    if (length($string) > $cutoff) { 
        $string = substr($string, 0, $cutoff - (length($marker) + 4)) . $marker . substr($string, -4);
    }
    return $string;
}

sub fixlinewidth {
    my ($string, $interval, $marker) = @_;
    
    return undef unless defined $string;
    $interval= 20 unless defined $interval;
    $marker = "<wbr />" unless defined $marker;
    
    return '' if $interval <= 0;
    $string =~   s/(\S{$interval})/$1$marker/g;

    return $string;
}

sub encode {
    my ($string, $unsafe_chars) = @_;
    return HTML::Entities::encode_entities($string, $unsafe_chars);
}

sub decode {
    my ($string) = @_;
    return HTML::Entities::decode($string);
}

sub unrender {
    my ($string, $convert_whitespace)  = @_;
    return undef unless defined $string;

    $string =~ s/&([^#]{1})/&amp;$1/g;
    $string =~ s/</&lt;/g;
    $string =~ s/"/&quot;/g;
    
    return $string unless ($convert_whitespace);

    $string =~ s/\n/<br \/>/g;
    $string =~ s/\t/&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;/g;
    
    return $string;
}

sub urlclean {
    my $url = shift;
    return $url unless $url;

    $url =~ s/\/+/\//g;
    $url =~ s/:\//:\/\//;
    return $url;
}



sub htmltotext {
    my $string = shift;
    return undef unless defined $string;

    #oh lord, this string replacement thing is so nasty, but
    #one of these html libraries was mangling entities.
    $string =~ s/\&([^;]+)?;/SOLSTICE__REPLACE__TOKEN$1;/g;

    my $tree = HTML::TreeBuilder->new_from_content($string);
    my $formatter = new Solstice::StringLibrary::FormatText(leftmargin => 0, rightmargin => 55);
    $string = $formatter->format($tree);
    $tree->delete();

    $string =~ s/SOLSTICE__REPLACE__TOKEN/\&/g;
    $string =~ s/&nbsp;/ /g;
    return $string;
}

sub extracttext {
    my $string = shift;
    return undef unless defined $string;
     
    $string =~ s/\&([^;]+)?;/SOLSTICE__REPLACE__TOKEN$1;/g;

    my $tree = HTML::TreeBuilder->new_from_content($string);
    $string = Solstice::StringLibrary::ExtractText->new()->format($tree);
    $tree->delete();
    
    $string =~ s/SOLSTICE__REPLACE__TOKEN/\&/g;
    return $string;
}

sub convertspaces {
    my $string = shift;
    return undef unless defined $string;

    $string =~ s/ /&nbsp;/g;
    return $string;
}

sub strtoascii {
    my $string = shift;
    return undef unless defined $string;

    for ($string) {
        tr/\x91\x92\x93\x94\x95\x96\x97\xa9/''""*\-\-C/;
        s/•/*/g;
        s/\x85/.../g;
    }
    return $string;
}

sub strtourl {
    my $string = shift;
    return undef unless defined $string;

    $string =~ s/(\W)/sprintf("%%%x", ord($1))/eg;
    return $string;
}

sub strtofilename {
    my ($string, $preserve_whitespace) = @_;
    return undef unless defined $string;

    my $replace = ($preserve_whitespace) ? "\\ " : '_';    

    for ($string) {
        s/\s/$replace/g;
        s/[\/\?\<\>\\\:\*\|\)\(\']//g;
    }
    return $string;
}

sub strtojavascript {
    my $string = shift;
    return undef unless defined $string;

    for ($string) {
        s/&#39;/'/g;
        #XXX well - removing this seems to clear up a lot of double-escaping we're seeing. hope it doesn't break anything.
#        s/\\/\\\\/g;
        s/"/\\"/g;
        s/'/\\'/g;
        s/[\n\r]//g;
    }
    return $string;
}

sub trimstr {
    my $string = shift;
    return undef unless defined $string;

    for ($string) {
        s/^(?:\s|&#09;|&#10;|&#13;|&#32;)+//;
        s/(?:\s|&#09;|&#10;|&#13;|&#32;)+$//;
    }
    return $string;
}

sub scrubcdata {
    my $string = shift;
    return undef unless defined $string;

    $string =~ s/]]>/]]&gt;/g;
    return $string;
}


package Solstice::StringLibrary::ExtractText;

use base qw(HTML::Formatter);

## no critic 
#this little section is determined by a superclass, doesn't fit our style guidlines

sub pre_out {
    my $self = shift;
    my $text = shift;
    $self->collect($text);
}

sub out {
    my $self = shift;
    my $text = shift;
    unless ($text =~ /^\s*$/) {
        $self->collect($text.' ');
    }
}

sub img_start {
    my ($self, $node) = @_;
    my $alt = $node->attr('alt');
    $alt = (defined $alt && $alt ne '') ? ": $alt" : '';
    $self->collect('[IMAGE'.$alt.'] ');
}

sub adjust_lm {} 
sub adjust_rm {}

## use critic

#this exists just to remove the line that corrupts some text for us
package Solstice::StringLibrary::FormatText;

use base qw(HTML::FormatText);

sub out
{
    my $self = shift;
    my $text = shift;

    #here's the culprit
#    $text =~ tr/\xA0\xAD/ /d;

    if ($text =~ /^\s*$/) {
    $self->{hspace} = 1;
    return;
    }

    if (defined $self->{vspace}) {
    if ($self->{out}) {
        $self->nl while $self->{vspace}-- >= 0;
        }
    $self->goto_lm;
    $self->{vspace} = undef;
    $self->{hspace} = 0;
    }

    if ($self->{hspace}) {
        if ($self->{curpos} + length($text) > $self->{rm}) {
            # word will not fit on line; do a line break
            $self->nl;
            $self->goto_lm;
        } else {
            # word fits on line; use a space
            $self->collect(' ');
            ++$self->{curpos};
        }
        $self->{hspace} = 0;
    }

    $self->collect($text);
    my $pos = $self->{curpos} += length $text;
    $self->{maxpos} = $pos if $self->{maxpos} < $pos;
    $self->{'out'}++;
}


1;

__END__