| Lingua-RU-Detect documentation | Contained in the Lingua-RU-Detect distribution. |
Lingua::RU::Detect - Heuristics for guessing encoding sequence
use Lingua::RU::Detect "detect_enc";
say Dumper(detect_enc("бНОПНЯ"));
say Dumper(detect_enc("бОДТЕК"));
Lingua::RU::Detect makes a guess at how the original text was re-encoded through a sequence of different encodings.
This module is the heart of the http://decodr.ru/ website, which provides a tool for automatically recovering Russian texts that were damaged by multiple transcodings. Two- and three-item chains can currently be detected, and the speed is much higher than that of programmes based on a dictionary.
The result of calling the detect_enc subroutine is a list of encoding pairs. To get the original UTF-8 string, you need to apply all of these transcodings in the order in which they are returned. For example:
$VAR1 = [ [ 'UTF-8', 'ISO-8859-5' ], [ 'KOI8-R', 'UTF-8' ] ];
If no re-encoding is needed, the result is an empty list.
For a test suite, refer to the Wikipedia page http://ru.wikipedia.org/wiki/%D0%9A%D1%80%D0%BE%D0%BA%D0%BE%D0%B7%D1%8F%D0%B1%D1%80%D1%8B (not all of the examples there are decoded correctly by the current version).
Andrew Shitov, <andy@shitov.ru>
The Lingua::RU::Detect module is free software. You may redistribute and/or modify it under the same terms as Perl 5.10.
| Lingua-RU-Detect documentation | Contained in the Lingua-RU-Detect distribution. |
package Lingua::RU::Detect; use vars qw ($VERSION); $VERSION = '1.1'; use strict; use utf8; require Exporter; our @ISA = qw(Exporter); our @EXPORT_OK = qw(detect_enc); my %patterns = ( '-|'. 'UTF-8.KOI8-R,CP1251.KOI8-R,CP1251.UTF-8', '\b(?:[Ð-Я][а-Ñ ]+|[а-Ñ]{3,})\b', 'UTF-8.CP1251,KOI8-R.UTF-8|'. 'UTF-8.KOI8-R,CP1251.UTF-8', '\b(?:[а-Ñ][Ð-Я]+|[Ð-Я]{3,})\b', 'UTF-8.ISO-8859-5,KOI8-R.UTF-8|'. 'UTF-8.KOI8-R,ISO-8859-5.UTF-8|'. 'UTF-8.CP1251,ISO-8859-5.UTF-8', '[а-Ñ]+[Ð-Я]+[а-Ñ]+|[Ð-Я]+[а-Ñ]+[Ð-Я]+', 'UTF-8.ISO-8859-5,CP1251.UTF-8|'. 'UTF-8.CP866,CP1251.UTF-8', '[а-ÑÑÑÑÑÑÑÑ]+[âÐÐÐÐâ]{1,2}[а-ÑÑÑÑÑÑÑÑ]+\b', 'UTF-7.UTF-8', '(?:^|\s)\+B[B-F][0-9a-zA-Z]+', 'UTF-7.CP866', '(?:^|\s)\+JWg[a-zA-Z]+', 'UTF-8.ISO-8859-1,UTF-7.ISO-8859-1', '(?:^|\s)\+AN[a-zA-Z]+', 'UTF-8.CP1251,KOI8-R.CP866', 'Ñ.¶|¶.¶', 'UTF-8.CP1252,KOI8-R.CP866', 'º.º|¶.º', 'UTF-8.CP1251,UTF-8.CP866', 'вâ¢Ð|â¢âºÐ²â¢', 'UTF-8.KOI8-R,UTF-8.CP866', 'Ðââ|Ðâ.Ðâ.', 'UTF-8.ISO-8859-1', 'ú|ÿÃ', 'UTF-8.CP1251,UTF-8.ISO-8859-1', 'ÐÑÐ|°Ðâ', 'UTF-8.KOI8-R,UTF-8.ISO-8859-1', 'Ñâб', 'UTF-8.CP1251,UTF-8.UTF-16', 'нâ.нâ|нâ .нâ ', 'UTF-8.KOI8-R,UTF-8.UTF-16', 'â¬Ð|âªÐ|â£Ð|â«Ð', 'UTF-8.CP1251,CP866.UTF-8', '®.+Ò|«Ð|®¤Â®|«Ò', 'UTF-8.CP1251,UTF-8.KOI8-R', 'Ð Ñв|СÐ', 'UTF-8.ISO-8859-1,ISO-8859-5.UTF-8', '[ýëåêòðèôèêà öèÿãóáåðÃèé]+[ÃÃà ÃÃÃÃÃÃÃÃÃßÃÃÃà ÃÃÃÃ]+[ýëåêòðèôèêà öèÿãóáåðÃèé]+|[ÃÃà ÃÃÃÃÃÃÃÃÃßÃÃÃà ÃÃÃÃ]+[ýëåêòðèôèêà öèÿãóáåðÃèé]+[ÃÃà ÃÃÃÃÃÃÃÃÃßÃÃÃà ÃÃÃÃ]+', 'UTF-8.CP1252,CP1251.UTF-8', '[ýëåêòðèôèêà öèÿãóáåðÃèé]{3,}', 'UTF-8.CP1252,KOI8-R.UTF-8', '[ÃÃÃÃÃÃÃÃÃÃÃà Ã]{2,}', 'UTF-8.CP866,KOI8-R.UTF-8', '[ââ§â«â£ââââââ¥â§â¦â´â¤â â¼â¦ââ¥ââââ¦â´âââ¤âââ¬ââ âââ¬â¼â¥â¬ââ©]{4,}', 'UTF-8.KOI8-R,CP866.UTF-8', '[ââ§â«â£ââââââ¥â§â¦â´â¤â â¼â¦ââ¥ââââ¦â´âââ¤âââ¬ââ âââ¬â¼â¥â¬ââ©]+[Ð-Я]+[ââ§â«â£ââââââ¥â§â¦â´â¤â â¼â¦ââ¥ââââ¦â´âââ¤âââ¬ââ âââ¬â¼â¥â¬ââ©]+[Ð-Я]+[ââ§â«â£ââââââ¥â§â¦â´â¤â â¼â¦ââ¥ââââ¦â´âââ¤âââ¬ââ âââ¬â¼â¥â¬ââ©]+', 'UTF-8.CP866,ISO-8859-5.UTF-8', '[а-оÑ-Ñ]+[ââ§â«â£ââââââ¥â§â¦â´â¤â â¼â¦ââ¥ââââ¦â´âââ¤âââ¬ââ 
âââ¬â¼â¥â¬ââ©]+[а-оÑ-Ñ]+[ââ§â«â£ââââââ¥â§â¦â´â¤â â¼â¦ââ¥ââââ¦â´âââ¤âââ¬ââ âââ¬â¼â¥â¬ââ©]+', 'UTF-8.KOI8-R', '[пÑ][ââ§â«â£ââââââ¥â§â¦â´â¤â â¼â¦ââ¥ââââ¦â´âââ¤âââ¬ââ âââ¬â¼â¥â¬ââ©][пÑ]|п.Ñ', 'UTF-8.CP866', '[ââââ§â«â£ââââââ¥â§â¦â´â¤â â¼â¦ââ¥ââââ¦â´âââ¤âââ¬ââ âââ¬â¼â¥â¬ââ©]+[Ð-Я][ââââ§â«â£ââââââ¥â§â¦â´â¤â â¼â¦ââ¥ââââ¦â´âââ¤âââ¬ââ âââ¬â¼â¥â¬ââ©]+[Ð-Я][ââââ§â«â£ââââââ¥â§â¦â´â¤â â¼â¦ââ¥ââââ¦â´âââ¤âââ¬ââ âââ¬â¼â¥â¬ââ©]+', ); my %ambiguities = ( '-' => 'на|ÑÑ|ни|но|ан|ов|ко|Ñо|ен|ле|ел|Ñа|Ñе|иÑ|по|ом|Ñо|еÑ|ва|Ñа|оÑ|Ñе|де|лÑ|еÑ|он|ÑÑ|за|Ñк|оÑ|ли|аÑ|ол|об|аÑ|од|ие|го|пÑ|Ñи|мо|ам|Ñл|ÑÑ|не|оÑ|ла|ал|иÑ', 'UTF-8.KOI8-R,CP1251.KOI8-R,CP1251.UTF-8' => 'лÑ|Ñп|лÑ|лм|Ñл|ма|им|пм|ел|йе|ей|оÑ|ое|ÑÑ|нм|мк|ом|еп|аÑ|пÑ|мÑ|пе|де|йÑ|ео|мл|пÑ|ÑÑ|Ñи|мп|йÑ|Ñп|мй|мÑ|Ñо|мд|Ñе|жм|но|оÑ|км|Ñк|Ñй|по|ле|мо|йÑ|Ñй|Ñп', 'UTF-8.CP1251,KOI8-R.UTF-8' => 'ÐÐ|УФ|ÐÐ|ÐÐ|ÐÐ|ÐЧ|ÐÐ|ФÐ|ÐÐ|ÐÐ|ÐÐ|ТÐ|ТÐ|ÐУ|Ð Ð|ÐÐ|ТÐ|ÐФ|ЧÐ|ФÐ|ÐУ|ФÐ|ÐÐ|ÐШ|ÐТ|ÐÐ|ФШ|ЪÐ|УÐ|ÐФ|ÐÐ|ÐФ|ÐÐ|ÐÐ|ÐТ|ÐÐ|ÐÐ|ÐÐ|РТ|ТÐ|ÐÐ|ÐÐ|УÐ|ФТ|ÐÐ|ÐТ|ÐÐ|ÐÐ|ÐФ', 'UTF-8.KOI8-R,CP1251.UTF-8' => 'ÐЮ|ЯР|ÐÐ¥|ÐÐ|ЮÐ|ÐÐ|ÐÐ|Ð Ð|ÐÐ|ÐÐ|ÐÐ|ÐЮ|ÐÐ|ХЯ|ÐÐ|ÐÐ|ÐÐ|ÐÐ |ÐЮ|РЮ|ÐЯ|Ð Ð|ÐÐ|ÐÐ|ÐÐ|ÐÐ|Ð Ð|ÐЮ|ЯÐ|ÐÐ |ÐÐ¥|ЮР|ÐÐ|ÐÐ|ЮÐ|ÐÐ|Ð¥Ð|ЦÐ|ÐÐ|ÐÐ¥|ÐÐ|ЮÐ|ЯÐ|Ð Ð|ÐÐ|ÐÐ|ÐЮ|ЮÐ|ХР', 'UTF-8.ISO-8859-5,KOI8-R.UTF-8' => 'ЮС|гд|ЮЩ|ЮЯ|СЮ|Яз|ЫЯ|дЯ|ХЮ|ЬХ|ХЬ|вС|вХ|Щг|аЯ|ЯÐ|вЯ|Хд|зС|дС|Яг|дХ|ФХ|Ьи|Хв|ЯЮ|ди|кС|гЫ|Яд|ЬЩ|Сд|ЯЬ|ЯТ|Св|ЯФ|ЩХ|ЧЯ|ав|вЩ|ÐЯ|СÐ|гЬ|дв|ЮХ|Яв|ЬС|СЬ|Щд', 'UTF-8.KOI8-R,ISO-8859-5.UTF-8' => 'Ñп|ÐÐ|ÑÑ|ÑÑ|пÑ|ÑÑ|зÑ|ÐÑ|ÑÑ|ÑÑ|ÑÑ|Юп|ЮÑ|ÑÐ|ÑÑ|ÑÑ|ЮÑ|ÑÐ|Ñп|Ðп|ÑÐ|ÐÑ|ÑÑ|ÑÐ|ÑЮ|ÑÑ|ÐÐ|вп|Ðз|ÑÐ|ÑÑ|пÐ|ÑÑ|ÑÑ|пЮ|ÑÑ|ÑÑ|ÑÑ|ÑЮ|ЮÑ|ÑÑ|пÑ|ÐÑ|ÐЮ|ÑÑ|ÑЮ|Ñп|пÑ|ÑÐ', 'UTF-8.CP1251,ISO-8859-5.UTF-8' => 'ÐÐ |бв|ÐШ|ÐЮ|Ð Ð|ЮТ|ЪЮ|вЮ|Ð¥Ð|ЫХ|ХЫ|аР|аХ|Шб|ЯЮ|ЮЬ|аЮ|Хв|ТР|вР|Юб|вХ|ФХ|Ым|Ха|ЮÐ|вм|ЧР|бЪ|Юв|ЫШ|Рв|ЮЫ|ЮС|Ра|ЮФ|ШХ|УЮ|Яа|аШ|ЬЮ|РЬ|бЫ|ва|ÐÐ¥|Юа|ЫР|РЫ|Шв', 'UTF-8.ISO-8859-5,CP1251.UTF-8' => 'ÑÑ|ÑÑ|ÑÑ|ÑÑ|ÑÑ|ÑÑ|ÑÑ|ÑÑ|Ñ Ñ|ÑÑ |Ñ Ñ|âÑ|âÑ |ÑÑ|ÑÑ|ÑÑ|âÑ|Ñ Ñ|ÑÑ|ÑÑ|ÑÑ|ÑÑ |ÑÑ |ÑÑ|Ñ â|ÑÑ|ÑÑ|ÑÑ|ÑÑ|ÑÑ|ÑÑ|ÑÑ|ÑÑ|ÑÑ|Ñâ|ÑÑ|ÑÑ |ÑÑ|Ñâ|âÑ|ÑÑ|ÑÑ|ÑÑ|Ñâ|ÑÑ |Ñâ|ÑÑ|ÑÑ|ÑÑ', 'UTF-8.CP866,CP1251.UTF-8' => 'ÑÑ|ÑÐ|ÑÑ|ÑÑ|ÑÑ|ÑÑ|ÑÑ|ÐÑ|Ñ Ñ|ÑÑ |Ñ Ñ|ÐÑ|ÐÑ 
|ÑÑ|ÑÑ|ÑÑ|ÐÑ|Ñ Ð|ÑÑ|ÐÑ|ÑÑ|ÐÑ |ÑÑ |Ñâ|Ñ Ð|ÑÑ|Ðâ|ÑÑ|ÑÑ|ÑÐ|ÑÑ|ÑÐ|ÑÑ|ÑÑ|ÑÐ|ÑÑ|ÑÑ |ÑÑ|ÑÐ|ÐÑ|ÑÑ|ÑÑ|ÑÑ|ÐÐ|ÑÑ |ÑÐ|ÑÑ|ÑÑ|ÑÐ', ); sub detect_enc { my $string = shift; my %variants = (); for my $path (sort keys %patterns) { $variants{$path} = () = $string =~ /$patterns{$path}/g; } my $path = scalar keys %variants ? (sort {$variants{$a} <=> $variants{$b}} keys %variants)[-1] : ''; $path = remove_ambiguity($path, $string) if $path =~ m{\|}; return make_list($path); } sub remove_ambiguity { my $paths = shift; my $text = shift; my @paths = split m{\|}, $paths; my %stats = (); for my $path (@paths) { $stats{$path} = () = $text =~ /$ambiguities{$path}/g; } return scalar keys %stats ? (sort {$stats{$a} <=> $stats{$b}} keys %stats)[-1] : $paths[0]; } sub make_list { my $path = shift; my @ret; for my $pair (split /,/, $path) { my ($from, $to) = split /\./, $pair; push @ret, [$from, $to] unless $from eq '-'; } return @ret; } 1; __END__