/usr/local/CPAN/perlSGML.1997Sep/SGML/StripParser.pm
##---------------------------------------------------------------------------##
## File:
## @(#) StripParser.pm 1.2 97/09/15 14:58:24 @(#)
## Author:
## Earl Hood ehood@medusa.acs.uci.edu
## Description:
## Class for stripping tags from SGML document instances.
##---------------------------------------------------------------------------##
## Copyright (C) 1997 Earl Hood, ehood@medusa.acs.uci.edu
##
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program; if not, write to the Free Software
## Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
## 02111-1307, USA
##---------------------------------------------------------------------------##
package SGML::StripParser;
use SGML::Parser;
@ISA = qw( SGML::Parser );
$VERSION = "0.01";
use SGML::ISO8859 qw( &sgml2str );
use SGML::Util;
##**********************************************************************##
## Public Methods
##**********************************************************************##
sub new {
my $this = new SGML::Parser;
my $class = shift;
$this->{'_stripout'}{'_fh'} = \*STDOUT;
$this->{'_stripout'}{'_html'} = 0;
$this->{'_stripout'}{'_url'} = '';
$this->{'_stripout'}{_charset} = '';
$this->{'_stripout'}{_ignents} = {};
$this->{'_stripout'}{_incents} = {};
bless $this, $class;
$this;
}
sub set_outhandle {
my $this = shift;
$this->{'_stripout'}{'_fh'} = shift;
}
sub set_html_mode {
my $this = shift;
$this->{'_stripout'}{'_html'} = shift;
}
sub set_charset {
my $this = shift;
$this->{'_stripout'}{_charset} = shift;
}
sub set_inc_parm_ents {
my $this = shift;
$this->{'_stripout'}{_incents}{@_} = ('INCLUDE') x scalar(@_);
}
sub set_ign_parm_ents {
my $this = shift;
$this->{'_stripout'}{_ignents}{@_} = ('IGNORE') x scalar(@_);
}
##**********************************************************************##
## Redefined SGML::Parser Callback Methods
##**********************************************************************##
sub cdata {
my $this = shift;
print { $this->{'_stripout'}{'_fh'} } $_[0];
}
sub char_ref {
my $this = shift;
my $val = shift;
my $str = '';
if ($this->{'_stripout'}{'_charset'}) {
$str = pack("C", int($val));
} else {
$str = defined($Entity{$val}) ? $Entity{$val} : "&#$val;";
}
print { $this->{'_stripout'}{'_fh'} } $str;
'';
}
sub comment_decl { }
sub end_tag {
my $this = shift;
my $gi = lc shift;
if ($this->{'_stripout'}{'_html'} and
$gi eq 'a' and
$this->{'_stripout'}{'_url'}) {
print { $this->{'_stripout'}{'_fh'} }
", <URL:$this->{'_stripout'}{'_url'}>, ";
$this->{'_stripout'}{'_url'} = '';
}
}
sub entity_ref {
my $this = shift;
my $name = shift;
my $str = '';
if ($this->{'_stripout'}{'_charset'}) {
$str = sgml2str("&$name;", $this->{'_stripout'}{'_charset'});
} else {
$str = defined($Entity{$name}) ? $Entity{$name} : "&$name;";
}
print { $this->{'_stripout'}{'_fh'} } $str;
'';
}
sub ignored_data { }
sub marked_sect_close { }
sub marked_sect_open { }
sub parm_entity_ref {
my $this = shift;
my $name = shift;
$this->{'_stripout'}{_ignents}{$name} ||
$this->{'_stripout'}{_incents}{$name};
}
sub processing_inst { }
sub start_tag {
my $this = shift;
my $gi = lc shift;
my $attr_spec = shift;
if ($this->{'_stripout'}{'_html'} and $gi eq 'a') {
my %attr = SGMLparse_attr_spec($attr_spec);
if ($attr{'href'}) {
($this->{'_stripout'}{'_url'} = $attr{'href'}) =~ s/#.*//;
}
}
}
##**********************************************************************##
## BEGIN block of module
##**********************************************************************##
BEGIN {
%Entity = (
#-----------------------------
# Numeric and Special Graphic
#-----------------------------
# Entity ASCII substitution
# ---------- ------------------
"half", "1/2", # fraction one-half
"frac12", "1/2",
"189", "1/2",
"frac14", "1/4", # fraction one-quarter
"188", "1/4",
"frac34", "3/4", # fraction three-quarters
"190", "3/4",
"frac18", "1/8", # fraction one-eighth
"frac38", "3/8", # fraction three-eighths
"frac58", "5/8", # fraction five-eighths
"frac78", "7/8", # fraction seven-eighths
"sup1", "[1]", # superscript one
"185", "[1]",
"sup2", "[2]", # superscript two
"178", "[2]",
"sup3", "[3]", # superscript three
"179", "[3]",
"plus", "+", # plus sign
"plusmn", "+/-", # plus-or-minus sign
"lt", "<", # less-than sign
"equals", "=", # equals sign
"gt", ">", # greater-than sign
"divide", "/", # divide sign
"times", "*", # times sign
"curren", "\$", # general currency sign
"164", "\$",
"pound", "L", # pound sign
"163", "L",
"dollar", "\$", # dollar sign
"cent", "c", # cent sign
"162", "c",
"yen", "Y", # yen sign
"165", "Y",
"num", "", # number sign
"percent", "%", # percent sign
"amp", "&", # ampersand
"ast", "*", # asterisk
"commat", "@", # commercial at
"lsqb", "[", # left square bracket
"bsol", "\\", # reverse solidus
"rsqb", "]", # right square bracket
"lcub", "{", # left curly bracket
"verbar", "|", # vertical bar
"rcub", "}", # right curly bracket
"copy", "(C)", # copyright sign
"169", "(C)",
"reg", "(R)", # registered sign
"174", "(R)",
"trade", "(TM)", # tradmark
"brvbar", "|", # broken vertical bar
"166", "|",
"excl", "!", # exclamation mark
"quot", '"', # quotation mark
"apos", "'", # apostrophe
"lpar", "(", # left parenthesis
"rpar", ")", # right parenthesis
"comma", ",", # comma
"lowbar", "_", # low line
"hyphen", "-", # hyphen
"period", ".", # full stop, period
"sol", "/", # solidus
"colon", ":", # colon
"semi", ";", # semicolon
"quest", "?", # question mark
"laquo", "<<", # angle quotation mark, left
"171", "<<",
"raquo", ">>", # angle quotation mark, right
"187", ">>",
"lsquo", "'", # single quotation mark, left
"rsquo", "'", # single quotation mark, right
"ldquo", '"', # double quotation mark, left
"rdquo", '"', # double quotation mark, right
"nbsp", " ", # no break (required) space
"160", " ",
"shy", "-", # soft hyphen
"173", "-",
#---------------
# Added Latin 1
#---------------
# Entity ASCII substitution
# ---------- ------------------
"aacute", "a", # small a, acute accent
"225", "a",
"Aacute", "A", # capital A, acute accent
"193", "A",
"acirc", "a", # small a, circumflex accent
"226", "a",
"Acirc", "A", # capital A, circumflex accent
"194", "A",
"agrave", "a", # small a, grave accent
"224", "a",
"Agrave", "A", # capital A, grave accent
"192", "A",
"aring", "a", # small a, ring
"229", "a",
"Aring", "A", # capital A, ring
"197", "A",
"atilde", "a", # small a, tilde
"227", "a",
"Atilde", "A", # capital A, tilde
"195", "A",
"auml", "a", # small a, dieresis or umlaut mark
"228", "a",
"Auml", "A", # capital A, dieresis or umlaut mark
"196", "A",
"aelig", "ae", # small ae diphthong (ligature)
"230", "ae",
"AElig", "AE", # capital AE diphthong (ligature)
"198", "AE",
"ccedil", "c", # small c, cedilla
"231", "c",
"Ccedil", "C", # capital C, cedilla
"199", "C",
"eth", "d", # small eth, Icelandic
"240", "d",
"ETH", "d", # capital Eth, Icelandic
"208", "d",
"eacute", "e", # small e, acute accent
"233", "e",
"Eacute", "E", # capital E, acute accent
"201", "E",
"ecirc", "e", # small e, circumflex accent
"234", "e",
"Ecirc", "E", # capital E, circumflex accent
"202", "E",
"egrave", "e", # small e, grave accent
"232", "e",
"Egrave", "E", # capital E, grave accent
"200", "E",
"euml", "e", # small e, dieresis or umlaut mark
"235", "e",
"Euml", "E", # capital E, dieresis or umlaut mark
"203", "E",
"iacute", "i", # small i, acute accent
"237", "i",
"Iacute", "I", # capital I, acute accent
"205", "I",
"icirc", "i", # small i, circumflex accent
"238", "i",
"Icirc", "I", # capital I, circumflex accent
"206", "I",
"igrave", "i", # small i, grave accent
"236", "i",
"Igrave", "I", # capital I, grave accent
"204", "I",
"iuml", "i", # small i, dieresis or umlaut mark
"239", "i",
"Iuml", "I", # capital I, dieresis or umlaut mark
"207", "I",
"ntilde", "n", # small n, tilde
"241", "n",
"Ntilde", "N", # capital N, tilde
"209", "N",
"oacute", "o", # small o, acute accent
"243", "o",
"Oacute", "O", # capital O, acute accent
"211", "O",
"ocirc", "o", # small o, circumflex accent
"244", "o",
"Ocirc", "O", # capital O, circumflex accent
"212", "O",
"ograve", "o", # small o, grave accent
"242", "o",
"Ograve", "O", # capital O, grave accent
"210", "O",
"oslash", "o", # small o, slash
"248", "o",
"Oslash", "O", # capital O, slash
"216", "O",
"otilde", "o", # small o, tilde
"245", "o",
"Otilde", "O", # capital O, tilde
"213", "O",
"ouml", "o", # small o, dieresis or umlaut mark
"246", "o",
"Ouml", "O", # capital O, dieresis or umlaut mark
"214", "O",
"szlig", "s", # small sharp s, German (sz ligature)
"223", "s",
"thorn", "p", # small thorn, Icelandic
"254", "p",
"THORN", "P", # capital THORN, Icelandic
"222", "P",
"uacute", "u", # small u, acute accent
"250", "u",
"Uacute", "U", # capital U, acute accent
"218", "U",
"ucirc", "u", # small u, circumflex accent
"251", "u",
"Ucirc", "U", # capital U, circumflex accent
"219", "U",
"ugrave", "u", # small u, grave accent
"249", "u",
"Ugrave", "U", # capital U, grave accent
"217", "U",
"uuml", "u", # small u, dieresis or umlaut mark
"252", "u",
"Uuml", "U", # capital U, dieresis or umlaut mark
"220", "U",
"yacute", "y", # small y, acute accent
"253", "y",
"Yacute", "Y", # capital Y, acute accent
"221", "Y",
"yuml", "y", # small y, dieresis or umlaut mark
"255", "y",
);
}
##---------------------------------------------------------------------------##
1;