Text::Phonetic::Phonix - Phonix algorithm


Text-Phonetic documentation Contained in the Text-Phonetic distribution.

Index


Code Index:

NAME

Top

Text::Phonetic::Phonix - Phonix algorithm

DESCRIPTION

Top

Phonix is an improved version of Soundex, developed by T.N. Gadd. Phonix has been incorporated into a number of WAIS implementations, including freeWAIS.

There seem to be two variants of the Phonix algorithm. One which also includes the first letter in the numeric code, and one that doesn't. This module is using the later variant.

AUTHOR

Top

    Maroš Kollár
    CPAN ID: MAROS
    maros [at] k-1.com
    http://www.k-1.com

COPYRIGHT

Top

SEE ALSO

Top


Text-Phonetic documentation Contained in the Text-Phonetic distribution.

# ============================================================================
package Text::Phonetic::Phonix;
# ============================================================================
use utf8;

use Moose;
extends qw(Text::Phonetic);

__PACKAGE__->meta->make_immutable;

our $VERSION = $Text::Phonetic::VERSION;

our $VOVEL = '[AEIOU]';
our $VOVEL_WITHY = '[AEIOUY]';
our $CONSONANT = '[BCDFGHJLMNPQRSTVXZXY]';

our @VALUES = (
    [qr/[AEIOUHWY]/,0],
    [qr/[BP]/,1],
    [qr/[CGJKQ]/,2],
    [qr/[DT]/,3],
    [qr/L/,4],
    [qr/[MN]/,5],
    [qr/R/,6],
    [qr/[FV]/,7],
    [qr/[SXZ]/,8],
);

our @RULES = (
    [qr/DG/,'G'],
    [qr/C([OAU])/,'K1'],
    [qr/C[YI]/,'SI'],
    [qr/CE/,'SE'],
    [qr/^CL($VOVEL)/,'KL1'],
    [qr/CK/,'K'],
    [qr/[GJ]C$/,'K'],
    [qr/^CH?R($VOVEL)/,'KR1'],
    [qr/^WR/,'R'],
    [qr/NC/,'NK'],
    [qr/CT/,'KT'],
    [qr/PH/,'F'],
    [qr/AA/,'AR'], #neu
    [qr/SCH/,'SH'],
    [qr/BTL/,'TL'],
    [qr/GHT/,'T'],
    [qr/AUGH/,'ARF'],
    [qr/(\w)LJ($VOVEL)/,'1LD2'],
    [qr/LOUGH/,'LOW'],
    [qr/^Q/,'KW'],
    [qr/^KN/,'N'],
    [qr/GN$/,'N'],
    [qr/GHN/,'N'],
    [qr/GNE$/,'N'],
    [qr/GHNE/,'NE'],
    [qr/GNES$/,'NS'],
    [qr/^GN/,'N'],
    [qr/(\w)GN($CONSONANT)/,'1N2'],
    [qr/^PS/,'S'],
    [qr/^PT/,'T'],
    [qr/^CZ/,'C'],
    [qr/($VOVEL)WZ(\w)/,'1Z2'],
    [qr/(\w)CZ(\w)/,'1CH2'],
    [qr/LZ/,'LSH'],
    [qr/RZ/,'RSH'],
    [qr/(\w)Z($VOVEL)/,'1S2'],
    [qr/ZZ/,'TS'],
    [qr/($CONSONANT)Z(\w)/,'1TS2'],
    [qr/HROUGH/,'REW'],
    [qr/OUGH/,'OF'],
    [qr/($VOVEL)Q($VOVEL)/,'1KW2'],
    [qr/($VOVEL)J($VOVEL)/,'1Y2'],
    [qr/^YJ($VOVEL)/,'Y1'],
    [qr/^GH/,'G'],
    [qr/($VOVEL)GH$/,'1E'],
    [qr/^CY/,'S'],
    [qr/NX/,'NKS'],
    [qr/^PF/,'F'],
    [qr/DT$/,'T'],
    [qr/(T|D)L$/,'1IL'],
    [qr/YTH/,'ITH'],
    [qr/^TS?J($VOVEL)/,'CH1'],
    [qr/^TS($VOVEL)/,'T1'],
    [qr/TCH/,'CH'], # old che
    [qr/^($VOVEL)WSK/,'1VSIKE'],
    [qr/^[PM]N($VOVEL)/,'N1'],
    [qr/($VOVEL)STL/,'1SL'],
    [qr/TNT$/,'ENT'],
    [qr/EAUX$/,'OH'],
    [qr/EXCI/,'ECS'],
    [qr/X/,'ECS'],
    [qr/NED$/,'ND'],
    [qr/JR/,'DR'],
    [qr/EE$/,'EA'],
    [qr/ZS/,'S'],
    [qr/($VOVEL)H?R($CONSONANT)/,'1AH2'],
    [qr/($VOVEL)HR$/,'1AH'],
    [qr/RE$/,'AR'],
    [qr/($VOVEL)R$/,'1AH'],
    [qr/LLE/,'LE'],
    [qr/($CONSONANT)LE(S?)$/,'1ILE2'],
    [qr/E$/,''],
    [qr/ES$/,'S'],
    [qr/($VOVEL)SS/,'1AS'],
    [qr/($VOVEL)MB$/,'1M'],
    [qr/MPTS/,'MPS'],
    [qr/MPS/,'MS'],
    [qr/MPT/,'MT'],

);

#sub _do_compare {
#	my $obj = shift;
#	my $result1 = shift;
#	my $result2 = shift;
#
#	# Main values are different
#	return 0 unless ($result1->[0] eq $result2->[0]);
#	
#	# Ending values the same
#	return 75 if ($result1->[1] eq $result2->[1]);
#	
#	# Ending values differ in length, and are same for the shorter
#	my $length1 = length $result1->[1];
#	my $length2 = length $result2->[1];
#	if ($length1 > $length2
#		&& $length1 - $length2 == 1) {
#		return 50 if (substr($result1->[1],0,$length2) eq $result2->[1]);
#	 }elsif ($length2 > $length1
#		&& $length2 - $length1 == 1) {	
#		return 50 if (substr($result2->[1],0,$length1) eq $result1->[1]);
#	}
#	
#	return 25;
#}
#The algorithm always returns either a scalar value or an array reference with 
#two elements. The fist element represents the sound of the name without the 
#ending sound, and the second element represents the ending sound. To get a 
#full representation of the name you need to concat the two elements.
#
#If you want to compare two names the following rules apply:
#
#=over
#
#=item * If the ending sound values of an entered name and a retrieved name are 
#the same, the retrieved name is a LIKELY candidate.
#
#=item * If an entered name has an ending-sound value, and the retrieved name 
#does not, then the retrieved name is a LEAST-LIKELY candidate.
#
#=item * If the two ending-sound values are the same for the length of the 
#shorter, and the difference in length between the two ending-sound is one 
#digit only, then the retrieved name isa LESS-LIKELY candidate.
#
#=item * All other cases result in LEAST-LIKELY candidates.
#
#=back

sub _do_encode {
    my ($self,$string) = @_;
    
    my ($original_string, $first_char);
    $original_string = $string;
    
    # To uppercase and remove other characters
    $string = uc($string);
    $string =~ tr/A-Z//cd;
    
    # RULE 1: Replcace rule
    foreach my $rule (@RULES) {
        my $regexp = $rule->[0];
        my $replace = $rule->[1];
        $string =~ s/$regexp/_replace($replace,$1,$2)/ge;
    }
    
    # RULE 2: Fetch first character
    $first_char = substr($string,0,1,'');
    
    # RULE 3: Exceptions for first character rule
    if (grep { $first_char eq $_ } qw(A E I O U Y)) {
        $first_char = 'v';
        $string =~ s/^$VOVEL_WITHY//;
    } elsif ($first_char eq 'W' || $first_char eq 'H') {
        #$string =~ s/^[WH]//;
    }
    
    # RULE 4
    $string =~ s/ES$/S/;
    # RULE 5
    $string =~ s/($VOVEL_WITHY)$/$1E/;
    # RULE 6
    #$string =~ s/\w$//; # This rule seems kind of strict
    # RULE 7-8
#   if ($string =~ s/($VOVEL_WITHY)([A-Z]+)$/$2/) {
#       # RULE 13
#       $last_string = _transform($2);
#   }
    
    # RULE 9-11
    $string = _transform($string);
    
    # RULE 12
    $string = $first_char.$string;
    
    #$string .= $last_string if (defined $last_string);
    $string .= '0'  x (8-length $string);
    $string = substr($string,0,8);
    
    return $string;
}

sub _transform {
    my $string = shift;
    return unless defined $string;
    
    # RULE 9
    $string =~ s/([AEIOUYHW])//g;
    # RULE 10
    $string =~ s/($CONSONANT+)\1/$1/g;
    # RULE 11
    foreach my $value (@VALUES) {
        my $regexp = $value->[0];
        $string =~ s/$regexp/$value->[1]/g;
    }
    return $string;
}

sub _replace {
    my $replace = shift;
    my $pos1 = shift;
    my $pos2 = shift;
    
    $replace =~ s/1/$pos1/ if (defined $pos1);
    $replace =~ s/2/$pos2/ if (defined $pos2);
    
    return $replace;
}

1;