| WWW-BookBot documentation | Contained in the WWW-BookBot distribution. |
WWW::BookBot::Chinese - Virtual class of bots to process Chinese e-texts.
use WWW::BookBot::Chinese::Novel::DragonSky;
my $bot=WWW::BookBot::Chinese::Novel::DragonSky->new({work_dir=>'/output'});
$bot->go_catalog({});
use WWW::BookBot::Chinese::Novel::ShuKu;
my $bot=WWW::BookBot::Chinese::Novel::ShuKu->new({});
$bot->go_catalog({desc=>'NewNovel', cat1=>0, cat2=>1, pageno=>0});
Virtual class of bots to process Chinese e-texts.
Virtual class of bots to process Chinese e-texts.
To be added.
None by default.
Please report any requests, suggestions or bugs via http://rt.cpan.org/NoAuth/ReportBug.html?Queue=WWW-BookBot
Qing-Jie Zhou <qjzhou@hotmail.com>
| WWW-BookBot documentation | Contained in the WWW-BookBot distribution. |
#-------------------------------------------------------------
# WWW::BookBot::Chinese
#   Subclass of WWW::BookBot (see "use base" below) that overrides the
#   default settings, entity decoding, translation dictionary, message
#   templates and pattern data.
#
# NOTE(review): the non-ASCII literals in this copy look mis-decoded
#   (presumably GBK bytes rendered as Latin-1/UTF-8 -- confirm), and the
#   bodies of the here-documents appear flattened onto a single line.
#   Both are reproduced exactly as found; restore the literals and the
#   here-document line breaks from the original distribution before
#   relying on them.
#-------------------------------------------------------------
package WWW::BookBot::Chinese;

use 5.008;
use strict;
use warnings;
no warnings qw(uninitialized);   # uninitialized-value warnings are switched off for this package
use base qw(WWW::BookBot);
use vars qw($VERSION);
$VERSION = '0.12';

#-------------------------------------------------------------
# Default settings
#   $class->default_settings => \%settings
#
# Takes the settings returned by the parent class and sets
# get_language to 'zh-cn' and language_decode / language_encode
# to 'gbk'.  Returns that same settings value.
#-------------------------------------------------------------
sub default_settings {
    my $self = shift->SUPER::default_settings;
    $self->{get_language}='zh-cn';
    $self->{language_decode}='gbk';
    $self->{language_encode}='gbk';
    $self;
}

#-------------------------------------------------------------
# Redefined functions
#   $bot->decode_entity($content_dein_deout) => N/A
#   $bot->trandict_init => $bot->{translate_dict}
#   $bot->msg_init => $bot->{messages}
#-------------------------------------------------------------

# Decodes HTML entities in place.  The text is the second argument and
# is modified through $_[1], i.e. directly in the caller's variable.
sub decode_entity {
    # Chinese novels sometimes add \x{FF1B} (fullwidth semicolon) after an
    # unknown Unicode character reference, so it is consumed together with
    # the optional ';'.
    # Decimal references: &#NNNNN
    $_[1]=~s/(?:&\#(\d{1,5});?\x{FF1B}?)/chr($1)/esg;
    # Hexadecimal references: &#xHHHHH
    $_[1]=~s/(?:&\#[xX]([0-9a-fA-F]{1,5});?\x{FF1B}?)/chr(hex($1))/esg;
    # Named entities: looked up in %WWW::BookBot::entity2char; when the
    # lookup yields a false value the matched text is put back unchanged.
    $_[1]=~s/(&([0-9a-zA-Z]{1,9});?)/$WWW::BookBot::entity2char{$2} or $1/esg;
    # Normalize middle dot: U+2022 is replaced by U+00B7.
    $_[1]=~s/\x{2022}/\x{00B7}/sg;
}

# Stores the translation dictionary in $bot->{translate_dict}.
# The Fail* message templates in msg_init index this hash with
# $pargs->{filetype}.
sub trandict_init {
    shift->{translate_dict} = {
        'log' => "ÈÕÖ¾",
        'result' => "½á¹û",
        'DB' => "Êý¾Ý",
        'debug' => "µ÷ÊÔ",
    }
}

# Stores the message templates in $bot->{messages}, keyed by message name.
# The templates are single-quoted strings or <<'DATA' here-documents, so
# the $pargs->{...} and $self->{...} references inside them are NOT
# interpolated here; presumably the base class expands them when a
# message is emitted -- TODO confirm.
sub msg_init {
    # Common suffix appended to several error/skip messages: a newline,
    # then the level indent and the URL, then a newline.
    my $skip_info="\n".'$pargs->{levelspace} url=$pargs->{url}'."\n";
    shift->{messages} = {
        TestMsg => '²âÊÔ: $pargs->{TestInfo} $pargs->{TestNum}',
        BookStart => '$pargs->{levelspace} [$pargs->{bpos_limit}/$pargs->{book_num}] $pargs->{title_limit} ',
        BookBinaryOK => '$pargs->{data_len_KB} $pargs->{write_file}'."\n",
        BookChapterErr => ' - ÎÞ·¨·ÖÎö'.$skip_info,
        BookChapterMany => '[$pargs->{chapter_num_limit}ÕÂ]',
        BookChapterOne => '[µ¥Õ½Ú]',
        BookChapterOK => '$pargs->{data_len_KB}'."\n",
        BookTOCFinish => '$pargs->{TOC_len_KB}'."\n",
        CatalogInfo => 'È¡ÊéÄ¿: ',
        CatalogResultErr=> ' 0Ì×Êé'."\n",
        CatalogResultOK => ' $pargs->{book_num}Ì×Êé'."\n",
        CatalogURL => '$pargs->{url}',
        # NOTE(review): this literal contains a line break in this copy;
        # it is kept as found -- verify against the original file.
        CatalogURLEmpty => '[ʧ°Ü] 
Ë÷ÒýµÄURLΪ¿Õ'."\n",
        DBBookErr => "\t".' \$bot->go_book({$pargs->{allargs}});'."\t#´íÎó\n",
        DBBookOK => "\t".'#\$bot->go_book({$pargs->{allargs}});'."\n",
        DBCatalogErr => ' \$bot->go_catalog({$pargs->{allargs}});'."\t#´íÎó\n",
        DBCatalogOK => '#\$bot->go_catalog({$pargs->{allargs}});'."\n",
        DBHead => <<'DATA',
#!$pargs->{perlcmd} ##====================================== ## ×Ô¶¯Éú³ÉµÄÊý¾ÝÎļþ£¬ÓÃÓÚ$pargs->{classname} ## Éú³Éʱ¼ä: $pargs->{createtime} ##====================================== use $pargs->{classname}; my \$bot = new $pargs->{classname};
DATA
        FailClearDB => 'ÎÞ·¨Çå³ýÊý¾ÝÎļþ$pargs->{filename}: $pargs->{errmsg}',
        FailClose => 'ÎÞ·¨¹Ø±Õ$self->{translate_dict}->{$pargs->{filetype}}Îļþ$pargs->{filename}: $pargs->{errmsg}',
        FailMkDir => '½¨Ä¿Â¼$pargs->{dir}ʧ°Ü: $pargs->{errmsg}',
        FailOpen => 'ÎÞ·¨´ò¿ª$self->{translate_dict}->{$pargs->{filetype}}Îļþ$pargs->{filename}: $pargs->{errmsg}',
        FailWrite => 'ÎÞ·¨Ð´Èë$self->{translate_dict}->{$pargs->{filetype}}Îļþ$pargs->{filename}: $pargs->{errmsg}',
        GetFail404 => <<'DATA',
[$pargs->{code},ʧ°Ü] ÕÒ²»µ½Îļþ $pargs->{url_real}
DATA
        GetFail404Detail=> <<'DATA',
[$pargs->{code},ʧ°Ü] ÕÒ²»µ½Îļþ >>>>ÇëÇó $pargs->{req_content}<<<<ÏìÓ¦ $pargs->{status_line}
DATA
        GetFailRetries => <<'DATA',
[$pargs->{code},ʧ°Ü] ÖØÊÔÌ«¶à£¬·ÅÆú $pargs->{url_real}
DATA
        GetFailRetriesDetail => <<'DATA',
[$pargs->{code},ʧ°Ü] ÖØÊÔÌ«¶à£¬·ÅÆú >>>>ÇëÇó $pargs->{req_content}<<<<ÏìÓ¦ $pargs->{status_line} $pargs->{res_content}
DATA
        GetURLSuccess => '$pargs->{len_KB} ',
        GetURLRetry => '[$pargs->{code},ÖØÊÔ] ',
        GetWait => 'µÈ´ý..',
        SkipMaxLevel => '[Ìø¹ý]²ãÊý>$self->{book_max_levels}'.$skip_info,
        SkipMedia => '[Ìø¹ý]ýÌåÎļþ'.$skip_info,
        SkipTitleEmpty => '[Ìø¹ý]±êÌâΪ¿Õ'.$skip_info,
        SkipUrlEmpty => '[Ìø¹ý]µØÖ·Îª¿Õ'."\n",
        SkipVisited => '[Ìø¹ý]ÒÑ·ÃÎʹý'."\n",
        SkipZip => '[Ìø¹ý]ѹËõÎļþ'.$skip_info,
    };
}

#-------------------------------------------------------------
# patterns
#
# Each getpattern_*_data method returns raw pattern text (a string).
# How the text is split or compiled is not visible in this file;
# presumably the base class consumes it -- confirm there.
#-------------------------------------------------------------

# Returns a character class, as text, for the extra space characters.
sub getpattern_space2_data {
    <<'DATA';
[¡¡£ ¡@]
DATA
}

# Returns the literal string used as the line-head marker.
sub getpattern_line_head_data {
    '¡¡¡¡';
}

# Returns the parent's parentheses data with the entries below appended.
# NOTE(review): the entries look like opening/closing pairs -- confirm.
sub getpattern_parentheses_data {
    shift->SUPER::getpattern_parentheses_data().<<'DATA';
¡¨ ¡¨ ¡® ¡¯ ¡° ¡± ¡² ¡³ ¡´ ¡µ ¡¶ ¡· ¡¸ ¡¹ ¡º ¡» ¡¼ ¡½ ¡¾ ¡¿ ¡ä ¡ä ¡å ¡å £¢ £¢ £§ £§ £¨ £© £¼ £¾ £Û £Ý £à £à £à £§ £û £ý ¦à ¦á ¦â ¦ã ¦ä ¦å ¦æ ¦ç ¦è ¦é ¦ê ¦ë ¦î ¦ï ¦ð ¦ñ ¨A ¨@ ¨F ¨F ¨ ¨ ©v ©w ©x ©y ©z ©{ © ©
DATA
}

# Returns a character class, as text, of dash/decoration mark characters.
sub getpattern_mark_dash_data {
    <<'DATA';
[#-&\*\+\-=@_~¡¥¡ª¡«¡¬¡¡Á¡Â¡Ë¡Ñ¡Ô¡Ö¡×¡Þ¡ç¡è¡é¡ë¡ì£££¥£¦£ª£«££½£À£ß£ü¨C¨D¨E¨O©W©\©`©¤-¡á©¡þ¡ù¦ò-¦õ©h-©n©~©©©©©©]
DATA
}

# Returns a character class, as text, of word-splitting punctuation.
sub getpattern_mark_wordsplit_data {
    <<'DATA';
[\.\,\?\!\:\;¡Ã¡¢¡£¡¤£¡£¬£®£º£»£¿©U©o©p©q©r©s©t©u]
DATA
}

# Returns the pattern text for the "word finish" marker.
sub getpattern_word_finish_data {
    <<'DATA';
(?:È«[ÎÄÊé]|)[ÍêÖÕ]
DATA
}

# Returns the pattern text used to remove lines by their ending
# (per the method name).  The body starts with a "(case)" token and
# includes several site host-name patterns.
sub getpattern_remove_line_by_end_data {
    <<'DATA';
(case) [±¨ÍøÉçѶ] [Á¬ÖØÅÅÕû³öÌáÍÆÉ¨Ð£½Ï±àÊéÊÀÊÓÎÄ¿ÆÔÚÌÖС¹¤×ª][ѧ»ÃÂÛ×÷]?(?:[ÔØÌùÅŰæÀíÆ·¹©³öÈëУ½ÏÃèÕý¶ÔÕßÎÝ¿â³Ç·½çÔ·ÏßÇø×éÊÒ]|º£Ñó|ÍûÔ¶¾µ|ÌÒ»¨Ô´|-K12)(?:Íê³É|) Çë(?:ÉêÇëÊÚȨ|±£Áôվ̨ÐÅÏ¢)[¡££®©q\.£¡©u]? ÖÆ×÷ [Oo£Ï£ï][Cc£Ã£ã][Rr£Ò£ò] ²É±àÖÐÐÄ Òà·²¹«ÒæÍ¼Êé¹Ý ÁúµÄÌì¿Õ ʧÂäµÄÐdz½ ÊéÏãÃÅµÚ ¾ÉÓêÂ¥ Ò»½£Ð¡ÌìÏ Öñ¶ºÉ·ç Ñï½£Ðù¾ÓÊ¿ »ÃÏëʱ´ú ðÏÕÕßÌìÌà ÐÅÏ¢ÖÐÐÄ cnread[\.¡££®¡¤©q]net ezla[\.¡££®¡¤©q]com?[\.¡££®¡¤©q]tw thebook[\.¡££®¡¤©q]yeah[\.¡££®¡¤©q]net y(?:esho[\.¡££®¡¤©q]com/wenxue|uzispy[\.¡££®¡¤©q]yeah[\.¡££®¡¤©q]net) www[\.¡££®¡¤©q](?:v-war|oldrain)[\.¡££®¡¤©q](?:net|com)
DATA
}

# Returns the "special" variant of the remove-line-by-end data.  Its
# characters are the same ones as the first bracketed class in
# getpattern_remove_line_by_end_data.
sub getpattern_remove_line_by_end_special_data {
    <<'DATA';
±¨ÍøÉçѶ
DATA
}

1;
__END__