| Text-Phonetic documentation | Contained in the Text-Phonetic distribution. |
Text::Phonetic::Phonix - Phonix algorithm
Phonix is an improved version of Soundex, developed by T.N. Gadd. Phonix has been incorporated into a number of WAIS implementations, including freeWAIS.
There seem to be two variants of the Phonix algorithm. One which also includes the first letter in the numeric code, and one that doesn't. This module is using the later variant.
Maroš Kollár
CPAN ID: MAROS
maros [at] k-1.com
http://www.k-1.com
Text::Phonetic::Phonix is Copyright (c) 2006,2007 Maroš. Kollár. All rights reserved.
This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself.
The full text of the license can be found in the LICENSE file included with this module.
| Text-Phonetic documentation | Contained in the Text-Phonetic distribution. |
# ============================================================================ package Text::Phonetic::Phonix; # ============================================================================ use utf8; use Moose; extends qw(Text::Phonetic); __PACKAGE__->meta->make_immutable; our $VERSION = $Text::Phonetic::VERSION; our $VOVEL = '[AEIOU]'; our $VOVEL_WITHY = '[AEIOUY]'; our $CONSONANT = '[BCDFGHJLMNPQRSTVXZXY]'; our @VALUES = ( [qr/[AEIOUHWY]/,0], [qr/[BP]/,1], [qr/[CGJKQ]/,2], [qr/[DT]/,3], [qr/L/,4], [qr/[MN]/,5], [qr/R/,6], [qr/[FV]/,7], [qr/[SXZ]/,8], ); our @RULES = ( [qr/DG/,'G'], [qr/C([OAU])/,'K1'], [qr/C[YI]/,'SI'], [qr/CE/,'SE'], [qr/^CL($VOVEL)/,'KL1'], [qr/CK/,'K'], [qr/[GJ]C$/,'K'], [qr/^CH?R($VOVEL)/,'KR1'], [qr/^WR/,'R'], [qr/NC/,'NK'], [qr/CT/,'KT'], [qr/PH/,'F'], [qr/AA/,'AR'], #neu [qr/SCH/,'SH'], [qr/BTL/,'TL'], [qr/GHT/,'T'], [qr/AUGH/,'ARF'], [qr/(\w)LJ($VOVEL)/,'1LD2'], [qr/LOUGH/,'LOW'], [qr/^Q/,'KW'], [qr/^KN/,'N'], [qr/GN$/,'N'], [qr/GHN/,'N'], [qr/GNE$/,'N'], [qr/GHNE/,'NE'], [qr/GNES$/,'NS'], [qr/^GN/,'N'], [qr/(\w)GN($CONSONANT)/,'1N2'], [qr/^PS/,'S'], [qr/^PT/,'T'], [qr/^CZ/,'C'], [qr/($VOVEL)WZ(\w)/,'1Z2'], [qr/(\w)CZ(\w)/,'1CH2'], [qr/LZ/,'LSH'], [qr/RZ/,'RSH'], [qr/(\w)Z($VOVEL)/,'1S2'], [qr/ZZ/,'TS'], [qr/($CONSONANT)Z(\w)/,'1TS2'], [qr/HROUGH/,'REW'], [qr/OUGH/,'OF'], [qr/($VOVEL)Q($VOVEL)/,'1KW2'], [qr/($VOVEL)J($VOVEL)/,'1Y2'], [qr/^YJ($VOVEL)/,'Y1'], [qr/^GH/,'G'], [qr/($VOVEL)GH$/,'1E'], [qr/^CY/,'S'], [qr/NX/,'NKS'], [qr/^PF/,'F'], [qr/DT$/,'T'], [qr/(T|D)L$/,'1IL'], [qr/YTH/,'ITH'], [qr/^TS?J($VOVEL)/,'CH1'], [qr/^TS($VOVEL)/,'T1'], [qr/TCH/,'CH'], # old che [qr/^($VOVEL)WSK/,'1VSIKE'], [qr/^[PM]N($VOVEL)/,'N1'], [qr/($VOVEL)STL/,'1SL'], [qr/TNT$/,'ENT'], [qr/EAUX$/,'OH'], [qr/EXCI/,'ECS'], [qr/X/,'ECS'], [qr/NED$/,'ND'], [qr/JR/,'DR'], [qr/EE$/,'EA'], [qr/ZS/,'S'], [qr/($VOVEL)H?R($CONSONANT)/,'1AH2'], [qr/($VOVEL)HR$/,'1AH'], [qr/RE$/,'AR'], [qr/($VOVEL)R$/,'1AH'], [qr/LLE/,'LE'], [qr/($CONSONANT)LE(S?)$/,'1ILE2'], [qr/E$/,''], [qr/ES$/,'S'], [qr/($VOVEL)SS/,'1AS'], [qr/($VOVEL)MB$/,'1M'], [qr/MPTS/,'MPS'], [qr/MPS/,'MS'], [qr/MPT/,'MT'], ); #sub _do_compare { # my $obj = shift; # my $result1 = shift; # my $result2 = shift; # # # Main values are different # return 0 unless ($result1->[0] eq $result2->[0]); # # # Ending values the same # return 75 if ($result1->[1] eq $result2->[1]); # # # Ending values differ in length, and are same for the shorter # my $length1 = length $result1->[1]; # my $length2 = length $result2->[1]; # if ($length1 > $length2 # && $length1 - $length2 == 1) { # return 50 if (substr($result1->[1],0,$length2) eq $result2->[1]); # }elsif ($length2 > $length1 # && $length2 - $length1 == 1) { # return 50 if (substr($result2->[1],0,$length1) eq $result1->[1]); # } # # return 25; #} #The algorithm always returns either a scalar value or an array reference with #two elements. The fist element represents the sound of the name without the #ending sound, and the second element represents the ending sound. To get a #full representation of the name you need to concat the two elements. # #If you want to compare two names the following rules apply: # #=over # #=item * If the ending sound values of an entered name and a retrieved name are #the same, the retrieved name is a LIKELY candidate. # #=item * If an entered name has an ending-sound value, and the retrieved name #does not, then the retrieved name is a LEAST-LIKELY candidate. # #=item * If the two ending-sound values are the same for the length of the #shorter, and the difference in length between the two ending-sound is one #digit only, then the retrieved name isa LESS-LIKELY candidate. # #=item * All other cases result in LEAST-LIKELY candidates. # #=back sub _do_encode { my ($self,$string) = @_; my ($original_string, $first_char); $original_string = $string; # To uppercase and remove other characters $string = uc($string); $string =~ tr/A-Z//cd; # RULE 1: Replcace rule foreach my $rule (@RULES) { my $regexp = $rule->[0]; my $replace = $rule->[1]; $string =~ s/$regexp/_replace($replace,$1,$2)/ge; } # RULE 2: Fetch first character $first_char = substr($string,0,1,''); # RULE 3: Exceptions for first character rule if (grep { $first_char eq $_ } qw(A E I O U Y)) { $first_char = 'v'; $string =~ s/^$VOVEL_WITHY//; } elsif ($first_char eq 'W' || $first_char eq 'H') { #$string =~ s/^[WH]//; } # RULE 4 $string =~ s/ES$/S/; # RULE 5 $string =~ s/($VOVEL_WITHY)$/$1E/; # RULE 6 #$string =~ s/\w$//; # This rule seems kind of strict # RULE 7-8 # if ($string =~ s/($VOVEL_WITHY)([A-Z]+)$/$2/) { # # RULE 13 # $last_string = _transform($2); # } # RULE 9-11 $string = _transform($string); # RULE 12 $string = $first_char.$string; #$string .= $last_string if (defined $last_string); $string .= '0' x (8-length $string); $string = substr($string,0,8); return $string; } sub _transform { my $string = shift; return unless defined $string; # RULE 9 $string =~ s/([AEIOUYHW])//g; # RULE 10 $string =~ s/($CONSONANT+)\1/$1/g; # RULE 11 foreach my $value (@VALUES) { my $regexp = $value->[0]; $string =~ s/$regexp/$value->[1]/g; } return $string; } sub _replace { my $replace = shift; my $pos1 = shift; my $pos2 = shift; $replace =~ s/1/$pos1/ if (defined $pos1); $replace =~ s/2/$pos2/ if (defined $pos2); return $replace; } 1;