| Text-TinySegmenter documentation | Contained in the Text-TinySegmenter distribution. |
Text::TinySegmenter - Super compact Japanese tokenizer
use Text::TinySegmenter; my @words = Text::TinySegmenter->segment($string);
This module is a Perl port of TinySegmenter.
The original TinySegmenter is available at: http://chasen.org/~taku/software/TinySegmenter/
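segment($string): Segments the string into words. Returns the words as a list in list context, or as an array reference in scalar context; an undefined or empty string yields an empty result.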
Jiro Nishiguchi <jiro@cpan.org>
And
Taku Kudo <taku@chasen.org>
Copyright (c) 2008, Taku Kudo
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the <ORGANIZATION> nor the names of its
contributors may be used to endorse or promote products derived from this
software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
| Text-TinySegmenter documentation | Contained in the Text-TinySegmenter distribution. |
package Text::TinySegmenter; use 5.8.1; use strict; use warnings; use utf8; our $VERSION = '0.01'; my %Patterns = ( "[ä¸äºä¸åäºå ä¸å «ä¹åç¾åä¸åå ]" => "M", "[ä¸-é¾ ã ããµã¶]" => "H", "[ã-ã]" => "I", "[ã¡-ã´ã¼ï½±-ï¾ï¾ï½°]" => "K", "[a-zA-Zï½-ï½ï¼¡-Z]" => "A", "[0-9ï¼-ï¼]" => "N", ); my @CharType; { while (my ($key, $val) = each %Patterns) { push @CharType, [qr/$key/, $val]; } } my $BIAS = -332; my %BC1 = ("HH" => 6,"II" => 2461,"KH" => 406,"OH" => -1378); my %BC2 = ("AA" => -3267,"AI" => 2744,"AN" => -878,"HH" => -4070,"HM" => -1711,"HN" => 4012,"HO" => 3761,"IA" => 1327,"IH" => -1184,"II" => -1332,"IK" => 1721,"IO" => 5492,"KI" => 3831,"KK" => -8741,"MH" => -3132,"MK" => 3334,"OO" => -2920); my %BC3 = ("HH" => 996,"HI" => 626,"HK" => -721,"HN" => -1307,"HO" => -836,"IH" => -301,"KK" => 2762,"MK" => 1079,"MM" => 4034,"OA" => -1652,"OH" => 266); my %BP1 = ("BB" => 295,"OB" => 304,"OO" => -125,"UB" => 352); my %BP2 = ("BO" => 60,"OO" => -1762); my %BQ1 = ("BHH" => 1150,"BHM" => 1521,"BII" => -1158,"BIM" => 886,"BMH" => 1208,"BNH" => 449,"BOH" => -91,"BOO" => -2597,"OHI" => 451,"OIH" => -296,"OKA" => 1851,"OKH" => -1020,"OKK" => 904,"OOO" => 2965); my %BQ2 = ("BHH" => 118,"BHI" => -1159,"BHM" => 466,"BIH" => -919,"BKK" => -1720,"BKO" => 864,"OHH" => -1139,"OHM" => -181,"OIH" => 153,"UHI" => -1146); my %BQ3 = ("BHH" => -792,"BHI" => 2664,"BII" => -299,"BKI" => 419,"BMH" => 937,"BMM" => 8335,"BNN" => 998,"BOH" => 775,"OHH" => 2174,"OHM" => 439,"OII" => 280,"OKH" => 1798,"OKI" => -793,"OKO" => -2242,"OMH" => -2402,"OOO" => 11699); my %BQ4 = ("BHH" => -3895,"BIH" => 3761,"BII" => -4654,"BIK" => 1348,"BKK" => -1806,"BMI" => -3385,"BOO" => -12396,"OAH" => 926,"OHH" => 266,"OHK" => -2036,"ONN" => -973); my %BW1 = (",ã¨" => 660,",å" => 727,"B1ã" => 1404,"B1å" => 542,"ãã¨" => 660,"ãå" => 727,"ãã¨" => 1682,"ãã£" => 1505,"ãã" => 1743,"ãã£" => -2055,"ãã" => 672,"ãã" => -4817,"ãã" => 665,"ãã" => 3472,"ãã" => 600,"ãã" => -790,"ãã¨" => 2083,"ãã" => -1262,"ãã" => -4143,"ãã" => 
4573,"ãã" => 2641,"ãã¦" => 1104,"ãã§" => -3399,"ãã" => 1977,"ãã" => -871,"ãã¡" => 1122,"ãã" => 601,"ã£ã" => 3463,"ã¤ã" => -802,"ã¦ã" => 805,"ã¦ã" => 1249,"ã§ã" => 1127,"ã§ã" => 3445,"ã§ã¯" => 844,"ã¨ã" => -4915,"ã¨ã¿" => 1922,"ã©ã" => 3887,"ãªã" => 5713,"ãªã£" => 3015,"ãªã©" => 7379,"ãªã" => -1113,"ã«ã" => 2468,"ã«ã¯" => 1498,"ã«ã" => 1671,"ã«å¯¾" => -912,"ã®ä¸" => -501,"ã®ä¸" => 741,"ã¾ã" => 2448,"ã¾ã§" => 1711,"ã¾ã¾" => 2600,"ã¾ã" => -2155,"ãã" => -1947,"ãã£" => -2565,"ãã" => 2369,"ãã§" => -913,"ãã" => 1860,"ãè¦" => 731,"亡ã" => -1886,"京é½" => 2558,"åã" => -2784,"大ã" => -2604,"大éª" => 1497,"å¹³æ¹" => -2314,"å¼ã" => -1336,"æ¥æ¬" => -195,"æ¬å½" => -2423,"æ¯æ¥" => -2113,"ç®æ" => -724,"ï¼¢ï¼ã" => 1404,"ï¼¢ï¼å" => 542,"ï½£ã¨" => 1682); my %BW2 = (".." => -11822,"11" => -669,"ââ" => -5730,"ââ" => -13175,"ãã" => -1609,"ãã" => 2490,"ãã" => -1350,"ãã" => -602,"ãã" => -7194,"ãã" => 4612,"ãã" => 853,"ãã" => -3198,"ãã" => 1941,"ããª" => -1597,"ãã¨" => -8392,"ãã®" => -4193,"ãã" => 4533,"ãã" => 13168,"ãã" => -3977,"ãã" => -1819,"ãã" => -545,"ãã" => 5078,"ãã¦" => 972,"ããª" => 939,"ãã®" => -3744,"ãã" => -1253,"ãã" => -662,"ãã " => -3857,"ãã¡" => -786,"ãã¨" => 1224,"ãã¯" => -939,"ã£ã" => 4589,"ã£ã¦" => 1647,"ã£ã¨" => -2094,"ã¦ã" => 6144,"ã¦ã" => 3640,"ã¦ã" => 2551,"ã¦ã¯" => -3110,"ã¦ã" => -3065,"ã§ã" => 2666,"ã§ã" => -1528,"ã§ã" => -3828,"ã§ã" => -4761,"ã§ã" => -4203,"ã¨ã" => 1890,"ã¨ã" => -1746,"ã¨ã¨" => -2279,"ã¨ã®" => 720,"ã¨ã¿" => 5168,"ã¨ã" => -3941,"ãªã" => -2488,"ãªã" => -1313,"ãªã©" => -6509,"ãªã®" => 2614,"ãªã" => 3099,"ã«ã" => -1615,"ã«ã" => 2748,"ã«ãª" => 2454,"ã«ã" => -7236,"ã«å¯¾" => -14943,"ã«å¾" => -4688,"ã«é¢" => -11388,"ã®ã" => 2093,"ã®ã§" => -7059,"ã®ã«" => -6041,"ã®ã®" => -6125,"ã¯ã" => 1073,"ã¯ã" => -1033,"ã¯ã" => -2532,"ã°ã" => 1813,"ã¾ã" => -1316,"ã¾ã§" => -6621,"ã¾ã" => 5409,"ãã¦" => -3153,"ãã" => 2230,"ãã®" => -10713,"ãã" => -944,"ãã" => -1611,"ãã«" => -1897,"ãã" => 651,"ãã¾" => 1620,"ãã" => 4270,"ãã¦" => 849,"ãã°" => 4114,"ãã" => 6067,"ãã" => 7901,"ãé" => 
-11877,"ãã " => 728,"ããª" => -4115,"ä¸äºº" => 602,"䏿¹" => -1375,"䏿¥" => 970,"ä¸é¨" => -1051,"ä¸ã" => -4479,"ä¼ç¤¾" => -1116,"åºã¦" => 2163,"åã®" => -7758,"åå " => 970,"忥" => -913,"大éª" => -2471,"å§å¡" => -1250,"å°ãª" => -1050,"年度" => -8669,"å¹´é" => -1626,"åºç" => -2363,"ææ¨©" => -1982,"æ°è" => -4066,"æ¥æ°" => -722,"æ¥æ¬" => -7068,"æ¥ç±³" => 3372,"ææ¥" => -601,"æé®®" => -2355,"æ¬äºº" => -2697,"æ±äº¬" => -1543,"ç¶ã¨" => -1384,"社ä¼" => -1276,"ç«ã¦" => -990,"第ã«" => -1612,"ç±³å½" => -4268,"ï¼ï¼" => -669); my %BW3 = ("ãã" => -2194,"ãã" => 719,"ãã" => 3846,"ã." => -1185,"ãã" => -1185,"ãã" => 5308,"ãã" => 2079,"ãã" => 3029,"ãã" => 2056,"ãã£" => 1883,"ãã" => 5600,"ãã" => 1527,"ãã¡" => 1117,"ãã¨" => 4798,"ãã¨" => 1454,"ã." => 2857,"ãã" => 2857,"ãã" => -743,"ãã£" => -4098,"ãã«" => -669,"ãã" => 6520,"ãã" => -2670,"ã," => 1816,"ãã" => 1816,"ãã" => -4855,"ãã" => -1127,"ãã£" => -913,"ãã" => -4977,"ãã" => -2064,"ãã" => 1645,"ãã©" => 1374,"ãã¨" => 7397,"ãã®" => 1542,"ãã" => -2757,"ãã" => -714,"ãã" => 976,"ã," => 1557,"ãã" => 1557,"ãã" => -3714,"ãã" => 3562,"ãã¦" => 1449,"ããª" => 2608,"ãã¾" => 1200,"ã." => -1310,"ãã" => -1310,"ãã" => 6521,"ã," => 3426,"ãã" => 3426,"ãã«" => 841,"ãã" => 428,"ã." => 8875,"ãã" => 8875,"ãã" => -594,"ãã®" => 812,"ãã" => -1183,"ãã" => -853,"ã ." 
=> 4098,"ã ã" => 4098,"ã ã£" => 1004,"ã£ã" => -4748,"ã£ã¦" => 300,"ã¦ã" => 6240,"ã¦ã" => 855,"ã¦ã" => 302,"ã§ã" => 1437,"ã§ã«" => -1482,"ã§ã¯" => 2295,"ã¨ã" => -1387,"ã¨ã" => 2266,"ã¨ã®" => 541,"ã¨ã" => -3543,"ã©ã" => 4664,"ãªã" => 1796,"ãªã" => -903,"ãªã©" => 2135,"ã«," => -1021,"ã«ã" => -1021,"ã«ã" => 1771,"ã«ãª" => 1906,"ã«ã¯" => 2644,"ã®," => -724,"ã®ã" => -724,"ã®å" => -1000,"ã¯," => 1337,"ã¯ã" => 1337,"ã¹ã" => 2181,"ã¾ã" => 1113,"ã¾ã" => 6943,"ã¾ã£" => -1549,"ã¾ã§" => 6154,"ã¾ã" => -793,"ãã" => 1479,"ãã" => 6820,"ãã" => 3818,"ã," => 854,"ãã" => 854,"ãã" => 1850,"ãã¦" => 1375,"ãã°" => -3246,"ãã" => 1091,"ãã" => -605,"ãã " => 606,"ãã§" => 798,"ã«æ" => 990,"ä¼è°" => 860,"å ¥ã" => 1232,"大ä¼" => 2217,"å§ã" => 1681,"å¸" => 965,"æ°è" => -5055,"æ¥," => 974,"æ¥ã" => 974,"社ä¼" => 2024,"ï½¶æ" => 990); my %TC1 = ("AAA" => 1093,"HHH" => 1029,"HHM" => 580,"HII" => 998,"HOH" => -390,"HOM" => -331,"IHI" => 1169,"IOH" => -142,"IOI" => -1015,"IOM" => 467,"MMH" => 187,"OOI" => -1832); my %TC2 = ("HHO" => 2088,"HII" => -1023,"HMM" => -1154,"IHI" => -1965,"KKH" => 703,"OII" => -2649); my %TC3 = ("AAA" => -294,"HHH" => 346,"HHI" => -341,"HII" => -1088,"HIK" => 731,"HOH" => -1486,"IHH" => 128,"IHI" => -3041,"IHO" => -1935,"IIH" => -825,"IIM" => -1035,"IOI" => -542,"KHH" => -1216,"KKA" => 491,"KKH" => -1217,"KOK" => -1009,"MHH" => -2694,"MHM" => -457,"MHO" => 123,"MMH" => -471,"NNH" => -1689,"NNO" => 662,"OHO" => -3393); my %TC4 = ("HHH" => -203,"HHI" => 1344,"HHK" => 365,"HHM" => -122,"HHN" => 182,"HHO" => 669,"HIH" => 804,"HII" => 679,"HOH" => 446,"IHH" => 695,"IHO" => -2324,"IIH" => 321,"III" => 1497,"IIO" => 656,"IOO" => 54,"KAK" => 4845,"KKA" => 3386,"KKK" => 3065,"MHH" => -405,"MHI" => 201,"MMH" => -241,"MMM" => 661,"MOM" => 841); my %TQ1 = ("BHHH" => -227,"BHHI" => 316,"BHIH" => -132,"BIHH" => 60,"BIII" => 1595,"BNHH" => -744,"BOHH" => 225,"BOOO" => -908,"OAKK" => 482,"OHHH" => 281,"OHIH" => 249,"OIHI" => 200,"OIIH" => -68); my %TQ2 = ("BIHH" => -1401,"BIII" => -1033,"BKAK" 
=> -543,"BOOO" => -5591); my %TQ3 = ("BHHH" => 478,"BHHM" => -1073,"BHIH" => 222,"BHII" => -504,"BIIH" => -116,"BIII" => -105,"BMHI" => -863,"BMHM" => -464,"BOMH" => 620,"OHHH" => 346,"OHHI" => 1729,"OHII" => 997,"OHMH" => 481,"OIHH" => 623,"OIIH" => 1344,"OKAK" => 2792,"OKHH" => 587,"OKKA" => 679,"OOHH" => 110,"OOII" => -685); my %TQ4 = ("BHHH" => -721,"BHHM" => -3604,"BHII" => -966,"BIIH" => -607,"BIII" => -2181,"OAAA" => -2763,"OAKK" => 180,"OHHH" => -294,"OHHI" => 2446,"OHHO" => 480,"OHIH" => -1573,"OIHH" => 1935,"OIHI" => -493,"OIIH" => 626,"OIII" => -4007,"OKAK" => -8156); my %TW1 = ("ã«ã¤ã" => -4681,"æ±äº¬é½" => 2026); my %TW2 = ("ããç¨" => -2049,"ãã£ã" => -1256,"ããã" => -2434,"ããã" => 3873,"ãã®å¾" => -4430,"ã ã£ã¦" => -1049,"ã¦ãã" => 1833,"ã¨ãã¦" => -4657,"ã¨ãã«" => -4517,"ãã®ã§" => 1882,"䏿°ã«" => -792,"åãã¦" => -1512,"åæã«" => -8097,"大ããª" => -1255,"対ãã¦" => -2721,"社ä¼å " => -3216); my %TW3 = ("ããã " => -1734,"ãã¦ã" => 1314,"ã¨ãã¦" => -4314,"ã«ã¤ã" => -5483,"ã«ã¨ã£" => -5989,"ã«å½ã" => -6247,"ã®ã§," => -727,"ã®ã§ã" => -727,"ã®ãã®" => -600,"ããã" => -3752,"åäºæ" => -2287); my %TW4 = ("ãã." 
=> 8576,"ããã" => 8576,"ãããª" => -2348,"ãã¦ã" => 2958,"ãã," => 1516,"ããã" => 1516,"ã¦ãã" => 1538,"ã¨ãã" => 1349,"ã¾ãã" => 5543,"ã¾ãã" => 1097,"ããã¨" => -4258,"ããã¨" => 5865); my %UC1 = ("A" => 484,"K" => 93,"M" => 645,"O" => -505); my %UC2 = ("A" => 819,"H" => 1059,"I" => 409,"M" => 3987,"N" => 5775,"O" => 646); my %UC3 = ("A" => -1370,"I" => 2311); my %UC4 = ("A" => -2643,"H" => 1809,"I" => -1032,"K" => -3450,"M" => 3565,"N" => 3876,"O" => 6646); my %UC5 = ("H" => 313,"I" => -1238,"K" => -799,"M" => 539,"O" => -831); my %UC6 = ("H" => -506,"I" => -253,"K" => 87,"M" => 247,"O" => -387); my %UP1 = ("O" => -214); my %UP2 = ("B" => 69,"O" => 935); my %UP3 = ("B" => 189); my %UQ1 = ("BH" => 21,"BI" => -12,"BK" => -99,"BN" => 142,"BO" => -56,"OH" => -95,"OI" => 477,"OK" => 410,"OO" => -2422); my %UQ2 = ("BH" => 216,"BI" => 113,"OK" => 1759); my %UQ3 = ("BA" => -479,"BH" => 42,"BI" => 1913,"BK" => -7198,"BM" => 3160,"BN" => 6427,"BO" => 14761,"OI" => -827,"ON" => -3212); my %UW1 = ("," => 156,"ã" => 156,"ã" => -463,"ã" => -941,"ã" => -127,"ã" => -553,"ã" => 121,"ã" => 505,"ã§" => -201,"ã¨" => -547,"ã©" => -123,"ã«" => -789,"ã®" => -185,"ã¯" => -847,"ã" => -466,"ã" => -470,"ã" => 182,"ã" => -292,"ã" => 208,"ã" => 169,"ã" => -446,"ã" => -137,"ã»" => -135,"主" => -402,"京" => -268,"åº" => -912,"å" => 871,"å½" => -460,"大" => 561,"å§" => 729,"å¸" => -411,"æ¥" => -141,"ç" => 361,"ç" => -408,"ç" => -386,"é½" => -718,"ï½¢" => -463,"ï½¥" => -135); my %UW2 = ("," => -829,"ã" => -829,"ã" => 892,"ã" => -645,"ã" => 3145,"ã" => -538,"ã" => 505,"ã" => 134,"ã" => -502,"ã" => 1454,"ã" => -856,"ã" => -412,"ã" => 1141,"ã" => 878,"ã" => 540,"ã" => 1529,"ã" => -675,"ã" => 300,"ã" => -1011,"ã" => 188,"ã " => 1837,"ã¤" => -949,"ã¦" => -291,"ã§" => -268,"ã¨" => -981,"ã©" => 1273,"ãª" => 1063,"ã«" => -1764,"ã®" => 130,"ã¯" => -409,"ã²" => -1273,"ã¹" => 1261,"ã¾" => 600,"ã" => -1263,"ã" => -402,"ã" => 1639,"ã" => -579,"ã" => -694,"ã" => 571,"ã" => -2516,"ã" => 2095,"ã¢" => -587,"ã«" => 306,"ã" => 
568,"ã" => 831,"ä¸" => -758,"ä¸" => -2150,"ä¸" => -302,"ä¸" => -968,"主" => -861,"äº" => 492,"人" => -123,"ä¼" => 978,"ä¿" => 362,"å ¥" => 548,"å" => -3025,"å¯" => -1566,"å" => -3414,"åº" => -422,"大" => -1769,"天" => -865,"太" => -483,"å" => -1519,"å¦" => 760,"å®" => 1023,"å°" => -2009,"å¸" => -813,"å¹´" => -1060,"å¼·" => 1067,"æ" => -1519,"æº" => -1033,"æ¿" => 1522,"æ" => -1355,"æ°" => -1682,"æ¥" => -1815,"æ" => -1462,"æ" => -630,"æ" => -1843,"æ¬" => -1650,"æ±" => -931,"æ" => -665,"次" => -2378,"æ°" => -180,"æ°" => -1740,"ç" => 752,"çº" => 529,"ç®" => -1584,"ç¸" => -242,"ç" => -1165,"ç«" => -763,"第" => 810,"ç±³" => 509,"èª" => -1353,"è¡" => 838,"西" => -744,"è¦" => -3874,"調" => 1010,"è°" => 1198,"è¾¼" => 3041,"é" => 1758,"é" => -1257,"ï½¢" => -645,"ï½£" => 3145,"ッ" => 831,"ï½±" => -587,"ï½¶" => 306,"ï½·" => 568); my %UW3 = ("," => 4889,"1" => -800,"â" => -1723,"ã" => 4889,"ã " => -2311,"ã" => 5827,"ã" => 2670,"ã" => -3573,"ã" => -2696,"ã" => 1006,"ã" => 2342,"ã" => 1983,"ã" => -4864,"ã" => -1163,"ã" => 3271,"ã" => 1004,"ã" => 388,"ã" => 401,"ã" => -3552,"ã" => -3116,"ã" => -1058,"ã" => -395,"ã" => 584,"ã" => 3685,"ã" => -5228,"ã" => 842,"ã¡" => -521,"ã£" => -1444,"ã¤" => -1081,"ã¦" => 6167,"ã§" => 2318,"ã¨" => 1691,"ã©" => -899,"ãª" => -2788,"ã«" => 2745,"ã®" => 4056,"ã¯" => 4555,"ã²" => -2171,"ãµ" => -1798,"ã¸" => 1199,"ã»" => -5516,"ã¾" => -4384,"ã¿" => -120,"ã" => 1205,"ã" => 2323,"ã" => -788,"ã" => -202,"ã" => 727,"ã" => 649,"ã" => 5905,"ã" => 2773,"ã" => -1207,"ã" => 6620,"ã" => -518,"ã¢" => 551,"ã°" => 1319,"ã¹" => 874,"ã" => -1350,"ã" => 521,"ã " => 1109,"ã«" => 1591,"ã" => 2201,"ã³" => 278,"ã»" => -3794,"ä¸" => -1619,"ä¸" => -1759,"ä¸" => -2087,"両" => 3815,"ä¸" => 653,"主" => -758,"äº" => -1193,"äº" => 974,"人" => 2742,"ä»" => 792,"ä»" => 1889,"以" => -1368,"ä½" => 811,"ä½" => 4265,"ä½" => -361,"ä¿" => -2439,"å " => 4858,"å " => 3593,"å ¨" => 1574,"å ¬" => -3030,"å " => 755,"å ±" => -1880,"å" => 5807,"å" => 3095,"å" => 457,"å" => 2475,"å¥" => 1129,"å" => 2286,"å¯" 
=> 4437,"å" => 365,"å" => -949,"å" => -1872,"å" => 1327,"å" => -1038,"åº" => 4646,"å" => -2309,"å" => -783,"å" => -1006,"å£" => 483,"å³" => 1233,"å" => 3588,"å" => -241,"å" => 3906,"å" => -837,"å¡" => 4513,"å½" => 642,"å" => 1389,"å ´" => 1219,"å¤" => -241,"妻" => 2016,"å¦" => -1356,"å®" => -423,"å®" => -1008,"å®¶" => 1078,"å°" => -513,"å°" => -3102,"å·" => 1155,"å¸" => 3197,"å¹³" => -1804,"å¹´" => 2416,"åº" => -1030,"åº" => 1605,"度" => 1452,"建" => -2352,"å½" => -3885,"å¾" => 1905,"æ" => -1291,"æ§" => 1822,"æ¸" => -488,"æ" => -3973,"æ¿" => -2013,"æ" => -1479,"æ°" => 3222,"æ" => -1489,"æ°" => 1764,"æ¥" => 2099,"æ§" => 5792,"æ¨" => -661,"æ" => -1248,"æ" => -951,"æ" => -937,"æ" => 4125,"æ" => 360,"æ" => 3094,"æ" => 364,"æ±" => -805,"æ ¸" => 5156,"森" => 2438,"æ¥" => 484,"æ°" => 2613,"æ°" => -1694,"決" => -1073,"æ³" => 1868,"æµ·" => -495,"ç¡" => 979,"ç©" => 461,"ç¹" => -3850,"ç" => -273,"ç¨" => 914,"çº" => 1215,"ç" => 7313,"ç´" => -1835,"ç" => 792,"ç" => 6293,"ç¥" => -1528,"ç§" => 4231,"ç¨" => 401,"ç«" => -960,"第" => 1201,"ç±³" => 7767,"ç³»" => 3066,"ç´" => 3663,"ç´" => 1384,"çµ±" => -4229,"ç·" => 1163,"ç·" => 1255,"è " => 6457,"è½" => 725,"èª" => -2869,"è±" => 785,"è¦" => 1044,"調" => -562,"財" => -733,"è²»" => 1777,"è»" => 1835,"è»" => 1375,"è¾¼" => -1504,"é" => -1136,"é¸" => -681,"é" => 1026,"é¡" => 4404,"é¨" => 1200,"é" => 2163,"é·" => 421,"é" => -1432,"é" => 1302,"é¢" => -1282,"é¨" => 2009,"é»" => -1045,"é" => 2066,"é§ " => 1620,"ï¼" => -800,"ï½£" => 2670,"ï½¥" => -3794,"ッ" => -1350,"ï½±" => 551,"クï¾" => 1319,"ï½½" => 874,"ï¾" => 521,"ï¾" => 1109,"ï¾" => 1591,"ï¾" => 2201,"ï¾" => 278); my %UW4 = ("," => 3930,"." 
=> 3508,"â" => -4841,"ã" => 3930,"ã" => 3508,"ã" => 4999,"ã" => 1895,"ã" => 3798,"ã" => -5156,"ã" => 4752,"ã" => -3435,"ã" => -640,"ã" => -2514,"ã" => 2405,"ã" => 530,"ã" => 6006,"ã" => -4482,"ã" => -3821,"ã" => -3788,"ã" => -4376,"ã" => -4734,"ã" => 2255,"ã" => 1979,"ã" => 2864,"ã" => -843,"ã" => -2506,"ã" => -731,"ã" => 1251,"ã" => 181,"ã" => 4091,"ã" => 5034,"ã " => 5408,"ã¡" => -3654,"ã£" => -5882,"ã¤" => -1659,"ã¦" => 3994,"ã§" => 7410,"ã¨" => 4547,"ãª" => 5433,"ã«" => 6499,"ã¬" => 1853,"ã" => 1413,"ã®" => 7396,"ã¯" => 8578,"ã°" => 1940,"ã²" => 4249,"ã³" => -4134,"ãµ" => 1345,"ã¸" => 6665,"ã¹" => -744,"ã»" => 1464,"ã¾" => 1051,"ã¿" => -2082,"ã" => -882,"ã" => -5046,"ã" => 4169,"ã" => -2666,"ã" => 2795,"ã" => -1544,"ã" => 3351,"ã" => -2922,"ã" => -9726,"ã" => -14896,"ã" => -2613,"ã" => -4570,"ã" => -1783,"ã" => 13150,"ã" => -2352,"ã«" => 2145,"ã³" => 1789,"ã»" => 1287,"ã" => -724,"ã" => -403,"ã¡" => -1635,"ã©" => -881,"ãª" => -541,"ã«" => -856,"ã³" => -3637,"ã»" => -4371,"ã¼" => -11870,"ä¸" => -2069,"ä¸" => 2210,"äº" => 782,"äº" => -190,"äº" => -1768,"人" => 1036,"以" => 544,"ä¼" => 950,"ä½" => -1286,"ä½" => 530,"å´" => 4292,"å " => 601,"å " => -2006,"å ±" => -1212,"å " => 584,"å" => 788,"å" => 1347,"å" => 1623,"å¯" => 3879,"å" => -302,"å" => -740,"å" => -2715,"å" => 776,"åº" => 4517,"å" => 1013,"å" => 1555,"å" => -1834,"å" => -681,"å¡" => -910,"å¨" => -851,"å" => 1500,"å½" => -619,"å" => -1200,"å°" => 866,"å ´" => -1410,"å¡" => -2094,"士" => -1413,"å¤" => 1067,"大" => 571,"å" => -4802,"å¦" => -1397,"å®" => -1057,"寺" => -809,"å°" => 1910,"å±" => -1328,"å±±" => -1500,"å³¶" => -2056,"å·" => -2667,"å¸" => 2771,"å¹´" => 374,"åº" => -4556,"å¾" => 456,"æ§" => 553,"æ" => 916,"æ" => -1566,"æ¯" => 856,"æ¹" => 787,"æ¿" => 2182,"æ" => 704,"æ" => 522,"æ¹" => -856,"æ¥" => 1798,"æ" => 1829,"æ" => 845,"æ" => -9066,"æ¨" => -485,"æ¥" => -442,"æ ¡" => -360,"æ¥" => -1043,"æ°" => 5388,"æ°" => -2716,"æ°" => -910,"æ²¢" => -939,"æ¸" => -543,"ç©" => -735,"ç" => 672,"ç" => -1267,"ç" => 
-1286,"ç£" => -1101,"ç°" => -2900,"çº" => 1826,"ç" => 2586,"ç®" => 922,"ç" => -3485,"ç" => 2997,"空" => -867,"ç«" => -2112,"第" => 788,"ç±³" => 2937,"ç³»" => 786,"ç´" => 2171,"çµ" => 1146,"çµ±" => -1169,"ç·" => 940,"ç·" => -994,"ç½²" => 749,"è " => 2145,"è½" => -730,"è¬" => -852,"è¡" => -792,"è¦" => 792,"è¦" => -1184,"è°" => -244,"è°·" => -1000,"è³" => 730,"è»" => -1481,"è»" => 1158,"輪" => -1433,"è¾¼" => -3370,"è¿" => 929,"é" => -1291,"é¸" => 2596,"é" => -4866,"é½" => 1192,"é" => -1100,"é" => -2213,"é·" => 357,"é" => -2344,"é¢" => -2297,"é" => -2604,"é»" => -878,"é " => -1659,"é¡" => -792,"館" => -1984,"é¦" => 1749,"é«" => 2120,"ï½¢" => 1895,"ï½£" => 3798,"ï½¥" => -4371,"ッ" => -724,"ï½°" => -11870,"ï½¶" => 2145,"コ" => 1789,"ï½¾" => 1287,"ï¾" => -403,"ï¾" => -1635,"ï¾" => -881,"ï¾" => -541,"ï¾" => -856,"ï¾" => -3637); my %UW5 = ("," => 465,"." => -299,"1" => -514,"E2" => -32768,"]" => -2762,"ã" => 465,"ã" => -299,"ã" => 363,"ã" => 1655,"ã" => 331,"ã" => -503,"ã" => 1199,"ã" => 527,"ã" => 647,"ã" => -421,"ã" => 1624,"ã" => 1971,"ã" => 312,"ã" => -983,"ã" => -1537,"ã" => -1371,"ã" => -852,"ã " => -1186,"ã¡" => 1093,"ã£" => 52,"ã¤" => 921,"ã¦" => -18,"ã§" => -850,"ã¨" => -127,"ã©" => 1682,"ãª" => -787,"ã«" => -1224,"ã®" => -635,"ã¯" => -578,"ã¹" => 1001,"ã¿" => 502,"ã" => 865,"ã" => 3350,"ã" => 854,"ã" => -208,"ã" => 429,"ã" => 504,"ã" => 419,"ã" => -1264,"ã" => 327,"ã¤" => 241,"ã«" => 451,"ã³" => -343,"ä¸" => -871,"京" => 722,"ä¼" => -1153,"å " => -654,"å" => 3519,"åº" => -901,"å" => 848,"å¡" => 2104,"大" => -1296,"å¦" => -548,"å®" => 1785,"åµ" => -1304,"å¸" => -2991,"å¸" => 921,"å¹´" => 1763,"æ" => 872,"æ" => -814,"æ" => 1618,"æ°" => -1682,"æ¥" => 218,"æ" => -4353,"æ»" => 932,"æ ¼" => 1356,"æ©" => -1508,"æ°" => -1347,"ç°" => 240,"çº" => -3912,"ç" => -3149,"ç¸" => 1319,"ç" => -1052,"ç" => -4003,"ç " => -997,"社" => -278,"空" => -813,"çµ±" => 1955,"è " => -2233,"表" => 663,"èª" => -1073,"è°" => 1219,"é¸" => -1018,"é" => -368,"é·" => 786,"é" => 1191,"é¡" => 2368,"館" => 
-689,"ï¼" => -514,"ï¼¥ï¼" => -32768,"ï½¢" => 363,"ï½²" => 241,"ï¾" => 451,"ï¾" => -343); my %UW6 = ("," => 227,"." => 808,"1" => -270,"E1" => 306,"ã" => 227,"ã" => 808,"ã" => -307,"ã" => 189,"ã" => 241,"ã" => -73,"ã" => -121,"ã" => -200,"ã" => 1782,"ã" => 383,"ã" => -428,"ã£" => 573,"ã¦" => -1014,"ã§" => 101,"ã¨" => -105,"ãª" => -253,"ã«" => -149,"ã®" => -417,"ã¯" => -236,"ã" => -206,"ã" => 187,"ã" => -135,"ã" => 195,"ã«" => -673,"ã³" => -496,"ä¸" => -277,"ä¸" => 201,"ä»¶" => -800,"ä¼" => 624,"å" => 302,"åº" => 1792,"å¡" => -1212,"å§" => 798,"å¦" => -960,"å¸" => 887,"åº" => -695,"å¾" => 535,"æ¥" => -697,"ç¸" => 753,"社" => -507,"ç¦" => 974,"空" => -822,"è " => 1811,"é£" => 463,"é" => 1082,"ï¼" => -270,"ï¼¥ï¼" => 306,"ï¾" => -673,"ï¾" => -496); sub _ctype { my $str = shift; for my $type (@CharType) { if ($str =~ $type->[0]) { return $type->[1]; } } return "O"; } sub _ts { $_[0] || 0; } sub segment { my ($class, $input) = @_; if (!defined $input || $input eq '') { return wantarray ? () : []; } my @result; my @seg = ("B3","B2","B1"); my @ctype = ("O","O","O"); my @o = split //, $input; for my $c (@o) { push @seg, $c; push @ctype, _ctype($c); } push @seg, "E1"; push @seg, "E2"; push @seg, "E3"; push @ctype, "O"; push @ctype, "O"; push @ctype, "O"; my $word = $seg[3]; my $p1 = "U"; my $p2 = "U"; my $p3 = "U"; for (my $i = 4; $i < @seg - 3; ++$i) { my $score = $BIAS; my $w1 = $seg[$i-3]; my $w2 = $seg[$i-2]; my $w3 = $seg[$i-1]; my $w4 = $seg[$i]; my $w5 = $seg[$i+1]; my $w6 = $seg[$i+2]; my $c1 = $ctype[$i-3]; my $c2 = $ctype[$i-2]; my $c3 = $ctype[$i-1]; my $c4 = $ctype[$i]; my $c5 = $ctype[$i+1]; my $c6 = $ctype[$i+2]; $score += _ts($UP1{$p1}); $score += _ts($UP2{$p2}); $score += _ts($UP3{$p3}); $score += _ts($BP1{$p1 . $p2}); $score += _ts($BP2{$p2 . $p3}); $score += _ts($UW1{$w1}); $score += _ts($UW2{$w2}); $score += _ts($UW3{$w3}); $score += _ts($UW4{$w4}); $score += _ts($UW5{$w5}); $score += _ts($UW6{$w6}); $score += _ts($BW1{$w2 . $w3}); $score += _ts($BW2{$w3 . 
$w4});    # completes the %BW2 lookup opened at the end of the previous line
        $score += _ts($BW3{$w4 . $w5});

        # Lookups keyed on three consecutive entries of @seg.
        $score += _ts($TW1{$w1 . $w2 . $w3});
        $score += _ts($TW2{$w2 . $w3 . $w4});
        $score += _ts($TW3{$w3 . $w4 . $w5});
        $score += _ts($TW4{$w4 . $w5 . $w6});

        # Lookups keyed on one character type ($c1..$c6 come from _ctype()).
        $score += _ts($UC1{$c1});
        $score += _ts($UC2{$c2});
        $score += _ts($UC3{$c3});
        $score += _ts($UC4{$c4});
        $score += _ts($UC5{$c5});
        $score += _ts($UC6{$c6});

        # Lookups keyed on two and three consecutive character types.
        $score += _ts($BC1{$c2 . $c3});
        $score += _ts($BC2{$c3 . $c4});
        $score += _ts($BC3{$c4 . $c5});
        $score += _ts($TC1{$c1 . $c2 . $c3});
        $score += _ts($TC2{$c2 . $c3 . $c4});
        $score += _ts($TC3{$c3 . $c4 . $c5});
        $score += _ts($TC4{$c4 . $c5 . $c6});

        # Disabled lookup. It must stay on a line of its own: '#' comments
        # out everything up to the end of the line, so sharing a line with
        # live code would silently disable that code as well.
        #$score += _ts($TC5{$c4 . $c5 . $c6});

        # Lookups keyed on the previous boundary decisions ($p1..$p3, each
        # "U", "O" or "B") combined with character types.
        $score += _ts($UQ1{$p1 . $c1});
        $score += _ts($UQ2{$p2 . $c2});
        # NOTE(review): this reads %UQ1 with the $p3/$c3 key, while %UQ3 is
        # declared but not read in the visible code. Confirm against the
        # reference implementation before changing it, because switching
        # tables would alter the scores and therefore the segmentation.
        $score += _ts($UQ1{$p3 . $c3});
        $score += _ts($BQ1{$p2 . $c2 . $c3});
        $score += _ts($BQ2{$p2 . $c3 . $c4});
        $score += _ts($BQ3{$p3 . $c2 . $c3});
        $score += _ts($BQ4{$p3 . $c3 . $c4});
        $score += _ts($TQ1{$p2 . $c1 . $c2 . $c3});
        $score += _ts($TQ2{$p2 . $c2 . $c3 . $c4});
        $score += _ts($TQ3{$p3 . $c1 . $c2 . $c3});
        $score += _ts($TQ4{$p3 . $c2 . $c3 . $c4});

        # A positive score puts a word boundary before $seg[$i]: emit the
        # word collected so far and record the decision as "B"; otherwise
        # the decision is "O" and the current word keeps growing.
        my $p = "O";
        if ($score > 0) {
            push @result, $word;
            $word = "";
            $p = "B";
        }

        # Shift the decision history and append the current character.
        $p1 = $p2;
        $p2 = $p3;
        $p3 = $p;
        $word .= $seg[$i];
    }

    # Flush the word that was still being collected when the loop ended.
    push @result, $word;

    # A list in list context, an array reference otherwise.
    return wantarray ? @result : \@result;
}

__END__