| Acme-Shukugawa-Atom documentation | Contained in the Acme-Shukugawa-Atom distribution. |
Acme::Shukugawa::Atom - ギロッポンにテッペンでバミった
use Acme::Shukugawa::Atom;
my $newstring = Acme::Shukugawa::Atom->translate($string);
# By default, share/config.yaml is used (via File::ShareDir) for custom
# fixed replacements. You can however override this by specifying the
# alternate config filename at load time
BEGIN
{
$Acme::Shukugawa::Atom::CONFIG = '/path/to/config.yaml';
}
use Acme::Shukugawa::Atom;
# Or you can specify them (on top of the default words) at run time
my $atom = Acme::Shukugawa::Atom->new(
# The default values are stored in @Acme::Shukugawa::Atom::DEFAULT_WORDS
custom_words => [
'regexp1' => 'replacement1'
'regexp2' => 'replacement2'
'regexp3' => 'replacement3'
'regexp4' => 'replacement4'
....
]
);
my $newstring = $atom->translate($string);
# shorter way
my $newstring = Acme::Shukugawa::Atom->translate($string,
custom_words => [ ... ]
);
夙川アトム風な文章を作成します。
まだまだ足りない部分がありますので、もしよければt/01_basic.tに希望する変換前と 変換後の結果を書いてテストをアップデートしてお知らせください。変換を 可能にするようにコードを修正してみます。
svnが使える方はこちらからどうぞ:
http://svn.coderepos.org/share/lang/perl/Acme-Shukugawa-Atom/trunk
板付き語(固定変換)を足す場合はインスタンス毎にcustom_wordsを変更するか、 share/config.yaml のcustom_wordsに追加してください。
Copyright (c) 2007 Daisuke Maki <daisuke@endeworks.jp>
This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself.
See http://www.perl.com/perl/misc/Artistic.html
| Acme-Shukugawa-Atom documentation | Contained in the Acme-Shukugawa-Atom distribution. |
# $Id: /mirror/coderepos/lang/perl/Acme-Shukugawa-Atom/trunk/lib/Acme/Shukugawa/Atom.pm 47728 2008-03-14T01:07:28.622095Z daisuke $ package Acme::Shukugawa::Atom; use strict; use warnings; use base qw(Class::Accessor::Fast); use utf8; use Encode qw(decode_utf8); use File::ShareDir; use Text::MeCab; use YAML (); our $VERSION = '0.00004'; __PACKAGE__->mk_accessors($_) for qw(custom_words); # Special case handling -- this could be optimized further # put it in a sharefile later our ($CONFIG, @DEFAULT_WORDS, $RE_EXCEPTION, $RE_SMALL, $RE_SYLLABLE, $RE_NBAR); BEGIN { my $config = YAML::LoadFile( $CONFIG || File::ShareDir::module_file(__PACKAGE__, 'config.yaml') ); $RE_SMALL = decode_utf8("[ã£ã¥ã§ãã¼]"); $RE_SYLLABLE = decode_utf8("(?:.$RE_SMALL?)"); $RE_NBAR = decode_utf8("^ã³ã¼"); @DEFAULT_WORDS = map { (decode_utf8($_->[0]), decode_utf8($_->[1])) } @{ $config->{custom_words} || [] }; } sub _create_exception_re { my $self = shift; my $custom = $self->custom_words; return decode_utf8(join("|", map { $custom->[$_ * 2 + 1] } (0..(scalar(@$custom) - 1)/2) )); } sub translate { my $self = shift; my $string = decode_utf8(shift); if (! ref $self) { $self = $self->new({ custom_words => \@DEFAULT_WORDS, @_ }); } # Create local RE_EXCEPTION local $RE_EXCEPTION = $self->_create_exception_re; $self->preprocess(\$string); $self->runthrough(\$string); $self->postprocess(\$string); return $string; } sub preprocess { my ($self, $strref) = @_; my $custom = $self->custom_words; for(0..(scalar(@$custom) - 1)/2) { my $pattern = $custom->[$_ * 2]; my $replace = $custom->[$_ * 2 + 1]; $$strref =~ s/$pattern/$replace/g; } } sub runthrough { my ($self, $strref) = @_; my $mecab = Text::MeCab->new; # First, make it all katakana, except for where the surface is already # in hiragana my $ret = ''; foreach my $text (split(/($RE_EXCEPTION|\s+)/, $$strref)) { if ($text =~ /$RE_EXCEPTION/) { $ret .= $text; next; } if ($text !~ /\S/) { $ret .= $text; next; } foreach (my $node = $mecab->parse($text); $node; $node = $node->next) { next unless $node->surface; my $surface = decode_utf8($node->surface); my $feature = decode_utf8($node->feature); my ($type, $yomi) = (split(/,/, $feature))[0,8]; # warn "$surface -> $type, $yomi"; if ($surface eq '䏿') { $ret .= 'ãã¤ã¦ã¼'; next; } if ($type eq 'åè©' && $node->next) { # å©åè©ãè¨ç®ã«å ¥ãã my $next_feature = decode_utf8($node->next->feature); my ($next_type, $next_yomi) = (split(/,/, $next_feature))[0,8]; if ($next_type eq 'å©åè©') { $yomi .= $next_yomi; $node = $node->next; } } if ($type =~ /å¯è©|å©åè©|形容è©|æ¥ç¶è©|å©è©/ && $surface =~ /^\p{InHiragana}+$/) { $ret .= $surface; } elsif ($yomi) { $ret .= $self->atomize($yomi) || $surface; } else { $ret .= $surface; } } } $$strref = $ret; } sub postprocess {} # ã·ã¼ã¹ã¼ã«ã¼ã« # 寿å¸âã·ã¼ã¹ã¼ # ã³ããæå¾ã ã£ããã²ã£ããè¿ããªã sub apply_shisu_rule { my ($self, $yomi) = @_; return $yomi if $yomi =~ s{^($RE_SYLLABLE)($RE_SYLLABLE)$}{ my ($a, $b) = ($1, $2); $a =~ s/ã¼$//; $b =~ s/ã¼$//; "${b}ã¼${a}ã¼"; }e; return; } # ã¯ã¤ãã¼ã«ã¼ã« # ãã¯ã¤âã¯ã¤ãã¼ sub apply_waiha_rule { my ($self, $yomi) = @_; # warn "WAIHA $yomi"; if ($yomi =~ s/^(${RE_SYLLABLE}[$RE_NBAR]?)([^$RE_NBAR].)$/$2$1/) { $yomi =~ s/(^.[^ã¼].*[^ã¼])$/$1ã¼/; return $yomi; } return; } # ã¯ãªããã«ã¼ã« # ã³ã£ããâã¯ãªãã sub apply_kuribitsu_rule { my ($self, $yomi) = @_; # warn "KURIBITSU $yomi"; if ($yomi =~ s/^(${RE_SYLLABLE}.)([^$RE_NBAR]${RE_SYLLABLE}$)/$2$1/) { return $yomi; } return; } sub atomize { my ($self, $yomi) = @_; $yomi =~ s/ã¼+/ã¼/g; # Length my $word_length = length($yomi); my $length = $word_length - ($yomi =~ /$RE_SMALL/g); if ($length == 3 && $yomi =~ s/^(${RE_SYLLABLE})ã/${1}ã/) { # warn "Special rule!"; $length = 4; } my $done = 0; # warn "$yomi LENGTH: $length"; if ($length == 2) { my $tmp = $self->apply_shisu_rule($yomi); if ($tmp) { $yomi = $tmp; $done = 1; } } if ($length == 3) { my $tmp = $self->apply_waiha_rule($yomi); if ($tmp) { $yomi = $tmp; $done = 1; } } if ($length == 4) { # 4 character words tend to have special xformation my $tmp = $self->apply_kuribitsu_rule($yomi); if ($tmp) { $yomi = $tmp; $done = 1; } } if (! $done) { $yomi =~ s/(.(?:ã¼+)?)$//; $yomi = $1 . $yomi; } $yomi =~ s/ã$/ã/; return $yomi; } 1; __END__