/usr/local/CPAN/Baseball-Sabermetrics/Baseball/Sabermetrics/League/CPBL.pm
package Baseball::Sabermetrics::League::CPBL;
use LWP::UserAgent;
use HTML::TableExtract;
use Encode qw/ encode decode /;
use Data::Serializer;
use constant CACHEFILE => 'cachefile';
use strict;
sub new
{
my ($class, %config) = @_;
my $data = Data::Serializer->new();
if (-f CACHEFILE && !$config{nocache}) {
my $mtime = (stat CACHEFILE)[9];
# just a heuristic for speeding up
if ($config{usecache} || time - $mtime < 6 * 3600) {
print STDERR "cache hit\n";
return $data->retrieve(CACHEFILE);
}
}
my $league = {};
my $teams = $league->{teams} = {};
$teams->{bulls} = { code => 'B02', name => 'bulls', company => "èè¾²" };
$teams->{cobras} = { code => 'G01', name => 'cobras', company => "èª æ³°" };
$teams->{elephants} = { code => 'E01', name => 'elephants', company => "å
å¼" };
$teams->{whales} = { code => 'W01', name => 'whales', company => "ä¸ä¿¡" };
$teams->{lions} = { code => 'L01', name => 'lions', company => "çµ±ä¸" };
$teams->{bears} = { code => 'A02', name => 'bears', company => "La New" };
extract_score($league, "http://www.cpbl.com.tw/Score/FightScore.aspx");
for my $team (values %$teams) {
print "Fetching records of team $team->{name}\n";
my $code = $team->{code};
$team->{players} = {};
extract_record(
$team,
"http://www.cpbl.com.tw/teams/Team_Pitcher.aspx?Tno=$code",
[qw/ name p_game gs _ _ win lose tie sv bs hld _ cg sho /]);
extract_record(
$team,
"http://www.cpbl.com.tw/teams/Team_Pitcher.aspx?Tno=$code&page=2",
[qw/ name ip p_pa np h_allowed hr_allowed sh_allowed sf_allowed p_bb p_ibb hb p_so wp bk ra er /]);
extract_record(
$team,
"http://www.cpbl.com.tw/teams/Team_Hitter.aspx?Tno=$code",
[qw/ name game pa ab rbi r h 1b 2b 3b hr tb dp /]);
extract_record(
$team,
"http://www.cpbl.com.tw/teams/Team_Hitter.aspx?Tno=$code&page=2",
[qw/ name sh sf 4ball ibb bb so sb cs /]);
# fgame stards for fielding games
extract_fielding($team, "http://www.cpbl.com.tw/teams/Team_defend.aspx?Tno=$code");
# fix something up
for my $p (values %{$team->{players}}) {
$p->{ip} = int($p->{ip}) + ($p->{ip} - int $p->{ip}) * 10 / 3
if exists $p->{ip};
if (exists $p->{'4ball'}) {
#$p->{hbp} = $p->{bb} - $p->{ibb} - $p->{'4ball'};
# XXX I'm not sure whether ibb is counted in bb in cpbl.com.tw
$p->{hbp} = $p->{bb} - $p->{'4ball'};
delete $p->{'4ball'};
}
else {
# XXX I'm not sure whether ibb is counted in bb in cpbl.com.tw
#$p->{p_bb} = $p->{p_bb} + $p->{p_ibb};
}
}
}
$data->store($league, CACHEFILE);
$league;
}
sub get_content
{
my $url = shift;
my $page = LWP::UserAgent->new();
my $ua = $page->get($url);
if (not $ua->is_success) {
die "failed to fetch url $url";
}
return $ua->content;
}
sub get_table_in_html
{
my ($page, $attribs) = @_;
my $t;
if (exists $attribs->{cpb2_choose_table}) {
$t = $attribs->{cpb2_choose_table};
delete $attribs->{cpb2_choose_table};
}
else {
$t = 0;
}
my $te = HTML::TableExtract->new(attribs => $attribs);
$te->parse($page =~ /^http/ ? get_content($page) : $page);
my @tables = $te->tables;
die "No table is found" unless @tables;
return $tables[$t];
}
sub extract_table_record
{
my ($page, $attribs, $hash, $n_dummy, $keys) = @_;
my $table = get_table_in_html($page, $attribs);
my @rows = $table->rows;
shift @rows for (1 .. $n_dummy);
my $i = 0;
while ($keys->[$i] eq '_') {
++$i;
}
for my $p_row (@rows) {
my @col = @$p_row;
$col[$i] =~ s/\s//g unless $keys->[$i] eq 'company';
$hash->{$col[$i]} = {} unless exists $hash->{$col[$i]};
my $h = $hash->{$col[$i]};
for ($i + 1 ..@$keys - 1) {
my $key = $keys->[$_];
if ($key eq '_') {
next;
}
$h->{$key} = $col[$_];
}
}
}
sub extract_score
{
my ($league, $url) = @_;
my $hash = {};
extract_table_record($url, { class => 'Report_Table_score', cpb2_choose_table => 2 }, $hash, 2,
[qw/ company game score /]);
for my $key (keys %$hash) {
my $name = $key;
$name =~ s/\d\.//;
my $team = $hash->{$key};
my ($t) = grep { $_->{company} eq $name } values %{$league->{teams}};
$t->{game} = $team->{game};
($t->{win}, $t->{tie}, $t->{lose}) = ($team->{score} =~ /(\d+)å(\d+)å(\d+)æ/);
}
}
sub extract_record
{
my ($team, $url, $cols) = @_;
extract_table_record($url, { class => 'Report_Table' }, $team->{players}, 2, $cols);
}
sub fix_cpbls_bug
{
my ($team, $ids) = @_;
my $content = get_content("http://www.cpbl.com.tw/teams/Team_Hitter.aspx?Tno=$team->{code}");
my %hash = @$ids;
my %hash2 = $content =~ m!href="\.\./personal_Rec/pbat_personal\.aspx\?Pno=([^"]+)">([^<]+)<!gs;
my %hash3 = (%hash, %hash2);
@$ids = %hash3;
}
sub extract_fielding
{
my ($team, $url) = @_;
my $content = get_content($url);
my @ids = $content =~ m!href="\.\./personal_Rec/pbat_personal\.aspx\?Pno=([^"]+)">([^<]+)<!gs;
fix_cpbls_bug($team, \@ids);
while (@ids) {
my $id = shift @ids;
my $name = shift @ids;
$name =~ s/\s//g;
my $p = $team->{players}->{$name};
my $year = 2006;
my $hash = {};
extract_table_record("http://www.cpbl.com.tw/personal_Rec/pbat_personal.aspx?Pno=$id", { class => 'Report_Table_pdf' }, $hash, 1,
[qw/ year _ fgame tc po a e f_dp tp pb c_cs c_sb _ /]);
my $h = $hash->{$year - 1989};
next unless $h;
while (my ($key, $value) = each %$h) {
$p->{$key} = $value;
}
$hash = {};
extract_table_record("http://www.cpbl.com.tw/personal_Rec/pdf_detail.aspx?pbyear=$year&Pno=$id", { class => 'Report_Table_pdf' }, $hash, 1,
[qw/ _ pos fgame tc po a e f_dp tp pb c_cs c_sb _ /]);
while (my ($key, $value) = each %$hash) {
my %posname = ( 'ææ' => 'p', 'ææ' => 'c', 'ä¸å£' => 'b1', 'äºå£' => 'b2', 'ä¸å£' => 'b3', '游æ' => 'ss', 'å·¦å¤é' => 'lf', 'ä¸å¤é' => 'cf', 'å³å¤é' => 'rf' );
$p->{fielding}->{$posname{$key}} = $value;
}
$content = get_content("http://www.cpbl.com.tw/personal_Rec/pbat_personal.aspx?Pno=$id");
$content =~ m!æ æï¼<span id="lbl_bpcustom">(.+)</span>!;
my $type = $1;
$p->{bio} = {};
if ($type =~ /éå¼/) {
$p->{bio}->{bats} = "switch";
$p->{bio}->{throws} = $type =~ /å·¦æ/ ? 'left' : 'right';
}
elsif ($type =~ /(.+)æ(.+)æ/) {
$p->{bio}->{bats} = $1 eq "å·¦" ? 'left' : 'right';
$p->{bio}->{throws} = $2 eq "å·¦" ? 'left' : 'right';
}
else {
die "unrecognized format: $type";
}
}
}
1;