| HTML-Split documentation | Contained in the HTML-Split distribution. |
HTML::Split - Splitting HTML by number of characters while keeping the DOM structure.
use HTML::Split; my $html = <<HTML; <div class="pkg"> <h1>HTML::Split</h1> <p>Splitting HTML by number of characters.</p> </div> HTML; my @pages = HTML::Split->split(html => $html, length => 50); # $pages[0] <div class="pkg"> # <h1>HTML::Split</h1> # <p>Splittin</p></div> # $pages[1] <div class="pkg"> # <p>g HTML by number of characters.</p></div>
HTML::Split is a module that splits HTML by number of characters while keeping the DOM structure intact.
On some mobile devices, mainly cell phones, the amount of data that can be fetched over HTTP is limited, so HTML has to be split into smaller pages.
This module provides a method for splitting HTML without destroying the DOM tree for such devices.
Splits HTML text by number of characters. It accepts the following parameters as a hash.
HTML string.
The length (in characters) of each page.
This will be deprecated. Please use HTML::Split::Pager instead.
Hiroshi Sakai <ziguzagu@cpan.org>
This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself.
| HTML-Split documentation | Contained in the HTML-Split distribution. |
package HTML::Split;

use strict;
use warnings;
use 5.008001;

our $VERSION = '0.04';

use Encode;
use HTML::Parser;

# Void elements: they never take an end tag, so they must not be pushed on
# the open-tag stack (otherwise a bogus closing tag would be generated for
# them and every following end tag would fail to match the stack top).
# Each name is registered twice, as "name" and "name/", because a tag written
# as <br/> may be reported by the parser with the tag name "br/".
my %_is_empty_tag = map { ( $_ => 1, "$_/" => 1 ) } qw(
    area base br col embed hr img input link meta param source track wbr
);

# HTML::Split->split(html => $html, length => $n, extend_tags => \@tags)
#
# Splits $html into pages of roughly $n characters each.  Tags that are still
# open at a page break are closed at the end of that page and re-opened at the
# beginning of the next one, so every page is a well-nested fragment.
#
# Parameters (passed as a hash):
#   html        - the HTML string to split.
#   length      - page length in characters.  It is truncated to an integer.
#   extend_tags - optional array ref of hashes with the keys "full", "begin"
#                 and "end" (regexes).  When the cut point falls inside text
#                 that matches "begin" on the left and "end" on the right,
#                 and both parts together match "full", the page is extended
#                 so that this text is not torn apart.
#
# Returns the list of pages.  Returns nothing when html is undefined or empty,
# and returns the HTML unsplit when length is missing, zero, negative or not
# a number, or when the HTML already fits into one page.
sub split {
    my $class = shift;
    my %param = @_;

    # Only a missing or empty document counts as "no input"; the string "0"
    # is a valid (if tiny) document.
    my $html = $param{html};
    return unless defined($html) && length($html);

    my $max_length = $param{length} or return ($html);
    {
        # A non-numeric length is treated like 0 instead of warning.
        no warnings 'numeric';
        $max_length = int $max_length;
    }
    # A page length below 1 can never make progress in the text splitting
    # loop below, so handle it like a missing length.
    return ($html) if $max_length < 1;

    my $extend_tags = $param{extend_tags} || [];

    # Count characters rather than bytes: a byte string is flagged as UTF-8
    # while it is processed and the flag is removed again from the result.
    my $is_utf8 = Encode::is_utf8($html);
    Encode::_utf8_on($html) unless $is_utf8;

    return ( $param{html} ) if length($html) <= $max_length;

    my @pages;                 # finished pages
    my @tags;                  # stack of currently open tags
    my $last_tag;              # top of @tags (undef when the stack is empty)
    my $forwarded_tags = '';   # start tags to re-open on the next page
    my $page           = '';   # page currently being assembled
    my $find_end_tag   = '';   # set to 'a' while inside an <a> element;
                               # no page break is made until it is closed

    ## page generator
    my $create_page = sub {
        # Re-open the tags that were still open when the previous page ended.
        $page = $forwarded_tags . $page if $forwarded_tags;

        # Close the tags that are still open, innermost first.
        $page .= join '', map { '</' . $_->{tagname} . '>' } reverse @tags;
        return unless $page;

        push @pages, $page;
        $forwarded_tags = join '', map { $_->{text} } @tags;
        $page           = '';
    };

    my $start_tag_handler = sub {
        my ( undef, $tagname, $text ) = @_;
        my $is_empty = $_is_empty_tag{$tagname};

        # Inside an <a> element: just collect, never break the page.
        if ($find_end_tag) {
            push @tags, $last_tag = { tagname => $tagname, text => $text }
                unless $is_empty;
            $page .= $text;
            return;
        }

        # A void element is appended before the size check, so it stays on
        # the current page; any other start tag goes onto the new page.
        $page .= $text if $is_empty;
        $create_page->() if length( $page . $text ) > $max_length;

        unless ($is_empty) {
            push @tags, $last_tag = { tagname => $tagname, text => $text };
            $page .= $text;
        }
        $find_end_tag = $tagname if $tagname eq 'a';
    };

    my $end_tag_handler = sub {
        my ( undef, $tagname, $text ) = @_;

        # An end tag that does not close the innermost open tag is dropped.
        return unless $last_tag && $last_tag->{tagname} eq $tagname;

        pop @tags;
        $last_tag = $tags[-1];
        $page .= $text;
        $find_end_tag = '' if $find_end_tag eq $tagname;

        $create_page->() if length($page) > $max_length && !$find_end_tag;
    };

    my $default_handler = sub {
        my ( undef, $text ) = @_;
        my $src = $page . $text;

        # Inside an <a> element: just collect, never break the page.
        if ($find_end_tag) {
            $page = $src;
            return;
        }

        while ( length($src) > $max_length ) {
            $page = substr $src, 0, $max_length;

            ## find indivisible extend tag
            my $over = 0;    # characters taken beyond $max_length
            for my $tag (@$extend_tags) {
                my $full_re  = $tag->{full}  or next;
                my $begin_re = $tag->{begin} or next;
                my $end_re   = $tag->{end}   or next;

                # The beginning must sit exactly at the cut point, hence \z.
                if ( my ($first) = $page =~ /($begin_re)\z/ ) {
                    my $next = substr $src, $max_length;
                    if ( my ($second) = $next =~ /^($end_re)/ ) {
                        my $may_have_tag = $first . $second;
                        if ( $may_have_tag =~ /^$full_re$/ ) {
                            $page .= $second;
                            $over = length $second;
                            # The page is extended once; checking further
                            # entries could append the same text again.
                            last;
                        }
                    }
                }
            }

            $create_page->();
            $src = substr $src, $max_length + $over;
        }

        $page = $src;
    };

    my $p = HTML::Parser->new(
        api_version => 3,
        start_h     => [ $start_tag_handler, "self,tagname,text", ],
        end_h       => [ $end_tag_handler,   "self,tagname,text", ],
        default_h   => [ $default_handler,   "self,text", ],
    );
    $p->parse($html);
    $p->eof;

    # Flush whatever is left as the last page.
    $create_page->();

    unless ($is_utf8) {
        Encode::_utf8_off($_) for @pages;
    }

    return @pages;
}

# HTML::Split->new(%param)
#
# Compatibility constructor: warns and returns the result of
# HTML::Split::Pager->new(%param).  HTML::Split::Pager is loaded on demand.
sub new {
    my $class = shift;
    my %param = @_;
    warn "This method will be deprecated. Please use HTML::Split::Pager->new instead.";
    require HTML::Split::Pager;
    return HTML::Split::Pager->new(%param);
}

1;

__END__