| HTML-Format documentation | Contained in the HTML-Format distribution. |
HTML::FormatText - Format HTML as plaintext
version 2.07
use HTML::TreeBuilder;
$tree = HTML::TreeBuilder->new->parse_file("test.html");
use HTML::FormatText;
$formatter = HTML::FormatText->new(leftmargin => 0, rightmargin => 50);
print $formatter->format($tree);
or, more simply:
use HTML::FormatText;
my $string = HTML::FormatText->format_file(
'test.html',
leftmargin => 0, rightmargin => 50
);
HTML::FormatText is a formatter that outputs plain latin1 text. All character attributes (bold/italic/underline) are ignored. Formatting of HTML tables and forms is not implemented.
HTML::FormatText is built on HTML::Formatter and documentation for that module applies to this - especially new in HTML::Formatter, format_file in HTML::Formatter and format_string in HTML::Formatter.
You might specify the following parameters when constructing the formatter:
The column of the left margin. The default is 3.
The column of the right margin. The default is 72.
See perlmodinstall for information and options on installing Perl modules.
No bugs have been reported.
Please report any bugs or feature requests through the web interface at http://rt.cpan.org/Public/Dist/Display.html?Name=HTML-Format.
The project homepage is http://search.cpan.org/dist/HTML-Format.
The latest version of this module is available from the Comprehensive Perl Archive Network (CPAN). Visit http://www.perl.com/CPAN/ to find a CPAN site near you, or see http://search.cpan.org/dist/HTML-Format/.
The development version lives at http://github.com/nigelm/html-format and may be cloned from git://github.com/nigelm/html-format.git. Instead of sending patches, please fork this project using the standard git and github infrastructure.
This software is copyright (c) 2011 by Nigel Metheringham, 2002-2005 Sean M Burke, 1999-2002 Gisle Aas.
This is free software; you can redistribute it and/or modify it under the same terms as the Perl 5 programming language system itself.
| HTML-Format documentation | Contained in the HTML-Format distribution. |
package HTML::FormatText; # ABSTRACT: Format HTML as plaintext use 5.006_001; use strict; use warnings; # We now use Smart::Comments in place of the old DEBUG framework. # this should be commented out in release versions.... ##use Smart::Comments; use base 'HTML::Formatter'; our $VERSION = '2.07'; # VERSION our $AUTHORITY = 'cpan:NIGELM'; # AUTHORITY # ------------------------------------------------------------------------ sub default_values { ( shift->SUPER::default_values(), lm => 3, # left margin rm => 72, # right margin (actually, maximum text width) ); } # ------------------------------------------------------------------------ sub configure { my ( $self, $hash ) = @_; my $lm = $self->{lm}; my $rm = $self->{rm}; $lm = delete $hash->{lm} if exists $hash->{lm}; $lm = delete $hash->{leftmargin} if exists $hash->{leftmargin}; $rm = delete $hash->{rm} if exists $hash->{rm}; $rm = delete $hash->{rightmargin} if exists $hash->{rightmargin}; my $width = $rm - $lm; if ( $width < 1 ) { warn "Bad margins, ignored" if $^W; return; } if ( $width < 20 ) { warn "Page probably too narrow" if $^W; } for ( keys %$hash ) { warn "Unknown configure option '$_'" if $^W; } $self->{lm} = $lm; $self->{rm} = $rm; $self; } # ------------------------------------------------------------------------ sub begin { my $self = shift; $self->SUPER::begin; $self->{curpos} = 0; # current output position. $self->{maxpos} = 0; # highest value of $pos (used by header underliner) $self->{hspace} = 0; # horizontal space pending flag } # ------------------------------------------------------------------------ sub end { shift->collect("\n"); } # ------------------------------------------------------------------------ sub header_start { my ( $self, $level ) = @_; $self->vspace( 1 + ( 6 - $level ) * 0.4 ); $self->{maxpos} = 0; 1; } # ------------------------------------------------------------------------ sub header_end { my ( $self, $level ) = @_; if ( $level <= 2 ) { my $line; $line = '=' if $level == 1; $line = '-' if $level == 2; $self->vspace(0); $self->out( $line x ( $self->{maxpos} - $self->{lm} ) ); } $self->vspace(1); 1; } # ------------------------------------------------------------------------ sub bullet { my $self = shift; $self->SUPER::bullet( $_[0] . ' ' ); } # ------------------------------------------------------------------------ sub hr_start { my $self = shift; $self->vspace(1); $self->out( '-' x ( $self->{rm} - $self->{lm} ) ); $self->vspace(1); } # ------------------------------------------------------------------------ sub pre_out { my $self = shift; # should really handle bold/italic etc. if ( defined $self->{vspace} ) { if ( $self->{out} ) { $self->nl() while $self->{vspace}-- >= 0; $self->{vspace} = undef; } } my $indent = ' ' x $self->{lm}; my $pre = shift; $pre =~ s/^/$indent/mg; $self->collect($pre); $self->{out}++; } # ------------------------------------------------------------------------ sub out { my $self = shift; my $text = shift; $text =~ tr/\xA0\xAD/ /d; if ( $text =~ /^\s*$/ ) { $self->{hspace} = 1; return; } if ( defined $self->{vspace} ) { if ( $self->{out} ) { $self->nl while $self->{vspace}-- >= 0; } $self->goto_lm; $self->{vspace} = undef; $self->{hspace} = 0; } if ( $self->{hspace} ) { if ( $self->{curpos} + length($text) > $self->{rm} ) { # word will not fit on line; do a line break $self->nl; $self->goto_lm; } else { # word fits on line; use a space $self->collect(' '); ++$self->{curpos}; } $self->{hspace} = 0; } $self->collect($text); my $pos = $self->{curpos} += length $text; $self->{maxpos} = $pos if $self->{maxpos} < $pos; $self->{'out'}++; } # ------------------------------------------------------------------------ sub goto_lm { my $self = shift; my $pos = $self->{curpos}; my $lm = $self->{lm}; if ( $pos < $lm ) { $self->{curpos} = $lm; $self->collect( " " x ( $lm - $pos ) ); } } # ------------------------------------------------------------------------ sub nl { my $self = shift; $self->{'out'}++; $self->{curpos} = 0; $self->collect("\n"); } # ------------------------------------------------------------------------ sub adjust_lm { my $self = shift; $self->{lm} += $_[0]; $self->goto_lm; } # ------------------------------------------------------------------------ sub adjust_rm { shift->{rm} += $_[0]; } 1; __END__