/usr/local/CPAN/Alvis-Convert/Alvis/Wikipedia/WikitextParser.pm


package Alvis::Wikipedia::WikitextParser;

$Alvis::Wikipedia::WikitextParser::VERSION = '0.1';

use warnings;
use strict;

########################################################################
#
#  Exported constants
#
########################################################################
#
# Text section types
#
our ($MARKUP,$NOWIKI,$PRE,$MATH,$HIERO,$GALLERY)=
    ('wiki markup','nowiki','pre','math','hiero','gallery');

#######################################################################
#
#  Error message stuff
#
######################################################################

my ($ERR_OK,
    $ERR_UNK_SECTION_TYPE,
    $ERR_NO_TEXT,
    $ERR_UNDEF_TITLE,
    $ERR_ONLY_UNDERSCORES_LEFT,
    $ERR_RELATIVE_PATH,
    $ERR_OVERLONG_TITLE,
    $ERR_HTML_CONVERSION,
    $ERR_MARKUP_SEPARATION
    )=(0..8);
my %ErrMsgs=($ERR_OK=>"",
	     $ERR_UNK_SECTION_TYPE=>"Unrecognized text section type.",
	     $ERR_NO_TEXT=>"No text to separate markup",
	     $ERR_UNDEF_TITLE=>"Undefined title to normalize.",
	     $ERR_ONLY_UNDERSCORES_LEFT=>"Only underscores left in the " .
	     "title to normalize.",
	     $ERR_RELATIVE_PATH=>"Title to be normalized is a path relative " .
	     "to the working directory.",
	     $ERR_OVERLONG_TITLE=>"Title to be normalized is too long.",
	     $ERR_HTML_CONVERSION=>"HTML conversion failed.",
	     $ERR_MARKUP_SEPARATION=>"Separating the wikitext to different " .
	     "categories failed."
	     );

sub _set_err_state
{
    my $self=shift;
    my $errcode=shift;
    my $errmsg=shift;


    if (!defined($errcode))
    {
	confess("set_err_state() called with an undefined argument.");
    }

    if (exists($ErrMsgs{$errcode}))
    {
	if ($errcode==$ERR_OK)
	{
	    $self->{errstr}="";
	}
	else
	{
	    $self->{errstr}.=" " . $ErrMsgs{$errcode};
	    if (defined($errmsg))
	    {
		$self->{errstr}.=" " . $errmsg;
	    }

	}
    }
    else
    {
	confess("Internal error: set_err_state() called with an " .
		"unrecognized argument ($errcode).")
    }
}

sub clearerr
{
    my $self=shift;
    
    $self->{errstr}="";
}

sub errmsg
{
    my $self=shift;
    
    return $self->{errstr};
}

####################################################################
#
#   Public methods
#
####################################################################

sub new
{
    my $proto=shift;

    my $class=ref($proto)||$proto;
    my $parent=ref($proto)&&$proto;
    my $self={};
    bless($self,$class);

    $self->_init(@_);

    return $self;
}

sub _init
{
    my $self=shift;

    if (defined(@_))
    {
        my %args=@_;
        @$self{ keys %args }=values(%args);
    }
}

sub normalize_title
{
    my $self=shift;
    my $title=shift;

    if (!defined($title))
    {
	$self->_set_err_state($ERR_UNDEF_TITLE);
	return undef;
    }

    #
    # subst:, template:, msg: etc.
    #
    $title=~s/^\w*://isgo;

    #
    # Do a bunch of ill-documented & idiotic normalizations.
    # 50,000,000 flies can't be wrong.
    #
    #  space/underscore
    $title=~s/^\s+//isgo;
    $title=~s/\s+$//isgo;
    $title=~s/[ ]+/\_/isgo;
    $title=~s/\_+/\_/isgo;
    $title=~s/^\_+//isgo;
    $title=~s/\_+$//isgo;
    if ($title=~/^\_+$/)
    {
	$self->_set_err_state($ERR_ONLY_UNDERSCORES_LEFT);
	return undef;
    }

    if ($title=~/^\.\.?\/?$/)
    {
	$self->_set_err_state($ERR_RELATIVE_PATH);
	return undef;
    }

    $title=~s/\?/\%3F/isgo;
    $title=~s/\//\&\#47;/isgo;

    #
    # There's also a set of characters which "cannot occur" in 
    # titles, but no info on what to do with them...remove/fail?
    # I'll ignore for now.
    #

    if (length($title)>256)
    {
	$self->_set_err_state($ERR_OVERLONG_TITLE);
	return undef;
    }

    $title=ucfirst($title);

    return $title;
}

sub to_HTML
{
    my $self=shift;
    my $text=shift;

    my $HTML=$self->_convert($text);
    if (!defined($HTML))
    {
	$self->_set_err_state($ERR_HTML_CONVERSION);
	return undef;
    }

    return $HTML;
}

#
# This is no great shakes, a two-pass parser that first
# separates main types of text and then converts them
# according to type 
# Maybe should be replaced by a Wikitext parser library if one
# exists somewhere
sub _convert
{
    my $self=shift;
    my $text=shift;

    my $sep_text=$self->separate_markup($text);
    if (!defined($sep_text))
    {
	 $self->_set_err_state($ERR_MARKUP_SEPARATION," Text:$text");
	 return undef;
    }
    my $res="";
    for my $s (@$sep_text)
    {
	my ($type,$t)=@$s;

	if ($type eq $MARKUP)
	{
	    $t=$self->_convert_markup($t);
	}
	elsif ($type eq $NOWIKI)
	{
	    $t=$self->_convert_nowiki($t);
	}
	elsif ($type eq $PRE)
	{
	    $t=$self->_convert_pre($t);
	}
	elsif ($type eq $MATH)
	{
	    $t=$self->_convert_math($t);
	}
	elsif ($type eq $HIERO)
	{
	    $t=$self->_convert_hiero($t);
	}
	elsif ($type eq $GALLERY)
	{
	    $t=$self->_convert_gallery($t);
	}
	else
	{
	    $self->_set_err_state($ERR_UNK_SECTION_TYPE,
				  " Type:$type");
	}
	
	$res.=$t;
    }

    return $res;
}

sub separate_markup
{
    my $self=shift;
    my $text=shift;

    if (!defined($text))
    {
	$self->_set_err_state($ERR_NO_TEXT);
	return undef;
    }
    #
    # separate no-markup sections first
    #
    my @text=();
    while ($text=~/^(.*?)<nowiki>(.*?)<\/nowiki>(.*?)$/isgo)
    {
	push(@text,[$MARKUP,$1]);
	push(@text,[$NOWIKI,$2]);
	$text=$3;
    }
    push(@text,[$MARKUP,$text]);

    my @text2=();
    for my $t (@text)
    {
	my ($type,$text)=@$t;
	if ($type eq $MARKUP)
	{
	    while ($text=~/^(.*?)<math>(.*?)<\/math>(.*?)$/isgo)
	    {
		push(@text2,[$MARKUP,$1]);
		push(@text2,[$MATH,$2]);
		$text=$3;
	    }
	    push(@text2,[$MARKUP,$text]);
	}
	else
	{
	    push(@text2,[$type,$text]);
	}
    }
    
    my @text3=();
    for my $t (@text2)
    {
	my ($type,$text)=@$t;
	if ($type eq $MARKUP)
	{
	    while ($text=~/^(.*?)<hiero>(.*?)<\/hiero>(.*?)$/isgo)
	    {
		push(@text3,[$MARKUP,$1]);
		push(@text3,[$HIERO,$2]);
		$text=$3;
	    }
	    push(@text3,[$MARKUP,$text]);
	}
	else
	{
	    push(@text3,[$type,$text]);
	}
    }
    
    my @text4=();
    for my $t (@text3)
    {
	my ($type,$text)=@$t;
	if ($type eq $MARKUP)
	{
	    while ($text=~/^(.*?)<gallery>(.*?)<\/gallery>(.*?)$/isgo)
	    {
		push(@text4,[$MARKUP,$1]);
		push(@text4,[$GALLERY,$2]);
		$text=$3;
	    }
	    push(@text4,[$MARKUP,$text]);
	}
	else
	{
	    push(@text4,[$type,$text]);
	}
    }
    
    return \@text4;
}

sub _convert_nowiki
{
    my $self=shift;
    my $t=shift;
    
    return $t;
}

sub _convert_math
{
    my $self=shift;
    my $t=shift;
    
    return "<PRE>$t</PRE>";
}

sub _convert_hiero
{
    my $self=shift;
    my $t=shift;
    
    return "";
}

sub _convert_gallery
{
    my $self=shift;
    my $t=shift;
    
    my $t_g;
    my @rows=split(/\n/,$t);
    for my $r (@rows)
    {
	if ($r=~/^\s*Image:(.*)$/)
	{
	    $t_g.=$self->_handle_images($1);
	}
    }

    return $t_g;
}

sub _convert_template
{
    my $self=shift;
    my $t=shift;
    
    return $t;
}

sub _convert_markup
{
    my $self=shift;
    my $t=shift;

    my $res="";

    my @r=split(/\n/,$t);
    for (my $i=0;$i<scalar(@r);$i++)
    {
	if ($r[$i]=~/^[ ](.*)$/)
	{
	    $res.="<PRE>\n";
	    $res.="$1\n";
	    $i++;
	    while ($i<scalar(@r) && $r[$i]=~/^[ ](.*)$/)
	    {
		$res.="$1\n";
		$i++;
	    }
	    $res.="</PRE>\n";	    
	    if ($i<scalar(@r))
	    {
		$i--; # rewind
	    } 
	}
	elsif ($r[$i]=~/^=====(.*?)=====/)
	{
	     my $text=$self->_handle_text($1);
	    $res.="\n<H5>$text</H5>\n"
	}
	elsif ($r[$i]=~/^====(.*?)====/)
	{
	    my $text=$self->_handle_text($1);
	    $res.="\n<H4>$text</H4>\n"
	}
	elsif ($r[$i]=~/^===(.*?)===/)
	{
	    my $text=$self->_handle_text($1);
	    $res.="\n<H3>$text</H3>\n"
	}
	elsif ($r[$i]=~/^==(.*?)==/)
	{
	    my $text=$self->_handle_text($1);
	    $res.="\n<H2>$text</H2>\n"
	}
	elsif ($r[$i]=~/^([\*\#]+)(.*?)$/)
	{
	    my @stars=split(//,$1);
	    my $text=$self->_handle_text($2);
	    $res.="\n<UL>\n<LI>$text</LI>\n";
	    $i++;
	    my $nof_levels_open=1;
	    my $curr_level=scalar(@stars);
	    while ($i<scalar(@r) && $r[$i]=~/^([\*\#]+)(.*?)$/)
	    {
		@stars=split(//,$1);
		my $text=$self->_handle_text($2);
		my $level=scalar(@stars);
		if ($level>$curr_level)
		{
		    $res.="\n<UL>\n<LI>$text</LI>\n";
		    $nof_levels_open++;
		}
		elsif ($level==$curr_level)
		{
		    
		    $res.="\n<LI>$text</LI>\n";
		}
		else
		{
		    $res.="\n</UL>\n</UL>\n<UL>\n<LI>$text\n";
		    $nof_levels_open=$curr_level--;
		}
		$curr_level=$level;
		$i++;
	    }

	    while ($nof_levels_open--)
	    {
		$res.="\n</UL>\n";	
	    }
	    if ($i<scalar(@r))
	    {
		$i--; # rewind
	    } 
	}
	elsif ($r[$i]=~/^;(.*?):(.*)$/)
	{
	    my $term=$self->_handle_text($1);
	    my $definition=$self->_handle_text($2);
	    $res.="\n<DL>\n<DT>$term</DT>\n<DD>$definition</DD>\n</DL>\n"
	}
 	elsif ($r[$i]=~/^;([^:]*?)$/)
	{
	    my $term=$self->_handle_text($1);
	    $res.="\n<DL>\n<DT>$term</DT>\n";
	    $i++;
	    if ($i<scalar(@r) && $r[$i]=~/^:(.*)$/)
	    {
		my $definition=$self->_handle_text($1);
		$res.="<DD>$definition</DD>\n";
	    }
	    $res.="</DL>\n";
	}
 	elsif ($r[$i]=~/^:(.*?)$/)
	{
	    $res.="<DL><DD>";
	    my $text=$r[$i];
	    $text=~s/^://;
	    $res.=$self->_convert_markup($text);
	    $res.="</DD></DL>\n";
	}
	elsif ($r[$i]=~/^----/)
	{
	    $res.="\n<HR>\n";
	}
	elsif ($r[$i]=~/^$/)
	{
	    $res.="\n<P>\n";
	}
	elsif ($r[$i]=~/^\{\| (.*)$/)
	{
	    $res.="\n<TABLE $1>\n"
	}
	elsif ($r[$i]=~/^\|\}/)
	{
	    $res.="\n</TABLE>\n"
	}
	elsif ($r[$i]=~/^\|\+(?:.*)(?:\|(.*))$/)
	{
	    my $text=$self->_handle_text($1);
	    $res.="\n<CAPTION>$text</CAPTION>\n"
	}
	elsif ($r[$i]=~/^\|\-(.*?)$/)
	{
	    $res.="\n<TR $1>\n";
	}
	elsif ($r[$i]=~/^\!(.*?)(?:\|(.*))$/)
	{
	    my $text=$self->_handle_text($2);
	    $res.="\n<TH $1>$text</TH>\n";
	}
	elsif ($r[$i]=~/^\|(.*)$/)
	{
	    my $cells=$self->_handle_text($1);
	    my @cells=split(/\|\|/,$cells);
	    for my $c (@cells)
	    {
		my ($param,$cell)=split(/\|/,$c);
		if (!defined($cell))
		{
		    $cell=$param;
		    $param="";
		}
		if (defined($param) && defined($cell))
		{
		    $res.="\n<TD $param>$cell</TD>\n";
		}
	    }
	}
	else
	{
	    my $text=$self->_handle_text($r[$i]);
	    $res.="$text\n";
	}
    }

#    $res=$self->_handle_text($res);

    return $res;
}

sub _handle_text
{
    my $self=shift;
    my $txt=shift;
    
    #
    # This is not 100% correct, but the corresponding Parser.php
    # is hacky as h*ll as well and very complicated. Awesome language
    # design. 
    #
    $txt=~s/\'\'\'(.*?)\'\'\'/<STRONG>$1<\/STRONG>/isgo;
    $txt=~s/\'\'(.*?)\'\'/<EM>$1<\/EM>/isgo;

    #
    # References & footnotes
    #
    $txt=~s/<\s*references\s*\/\s*>//isgo;
    $txt=~s/<\s*ref\s*>/<OL><LI>/isgo;
    $txt=~s/<\s*\/ref\s*>/<\/LI><\/OL>/isgo;

    #
    #
    # Lame, but this is basically what Parser.php is forced to fall upon
    # as well. What you design is what you get.
    #
    $txt=~s/\[\[([^\[]*?)\]\]/$self->_handle_internal_link($1)/isgoe;
    $txt=~s/\[\[Image:(.*?)\]\]/$self->_handle_images($1)/isgoe;
    $txt=~s/\[([^\[]*?)\]/$self->_handle_external_link($1)/isgoe;
    
    return $txt;
}

sub _handle_images
{
    my $self=shift;
    my $link_txt=shift;

    my @parts=split(/\|/,$link_txt);

    if (scalar(@parts)==1)
    {
	my $link=$parts[0];
	$link=~s/ /_/sgo;
	return "<IMG src=\"wikipedia/images/$link\">$link</IMG>";
    }
    elsif (scalar(@parts)>=2)
    {
	my ($link,$title)=($parts[0],$parts[$#parts]);
 	$link=~s/ /_/sgo;
	my $title_txt=$self->_handle_text($title);
	if (!defined($title_txt))
	{
	    $title_txt="";
	}
	else
	{
	    $title_txt=~s/^\s+//sgo;
	    $title_txt=~s/\s+$//sgo;
	}
	return "<IMG src=\"wikipedia/images/$link\">$title_txt</IMG>";
    }
    else
    {
	return "[[$link_txt]]";
    }
}

sub _handle_external_link
{
    my $self=shift;
    my $link_txt=shift;

    if ($link_txt=~/^Image:/i)
    {
	return "[$link_txt]";
    }

    my @parts=split(/ /,$link_txt);
    if (scalar(@parts)>1)
    {
	my $text=join(" ",@parts[1..$#parts]);
	
 	$parts[0]=~s/ /_/sgo;
	$text=~s/^\s+//isgo;
	$text=~s/\s+$//isgo;
	return "<A href=\"$parts[0]\">$text</A>";
    }
    elsif (scalar(@parts)==1)
    {
        $parts[0]=~s/ /_/sgo; 
	return "<A href=\"$parts[0]\"></A>";
    }
    else
    {
	return "[$link_txt]"; # Probably sth is wrong...
    }
}

sub _handle_internal_link
{
    my $self=shift;
    my $link_txt=shift;

    if ($link_txt=~/^Image:/i)
    {
	return "[[$link_txt]]";
    }

    my @parts=split(/\|/,$link_txt);
    if (scalar(@parts)==1)
    {
	my $link=$parts[0];
	my $text=$link;
	$text=~s/^\s+//isgo;
	$text=~s/\s+$//isgo;
 	$link=~s/ /_/sgo;
	return "<A href=\"wikipedia/$link\">$text</A>";
    }
    elsif (scalar(@parts)==2)
    {
	my ($link,$title)=@parts[0..1];
	$title=$self->_handle_text($title);
 	$link=~s/ /_/sgo;
	$title=~s/^\s+//isgo;
	$title=~s/\s+$//isgo;
	return "<A href=\"wikipedia/$link\">$title</A>";
    }
    else
    {
	return "[[$link_txt]]";
    }
}

1;