Apache::GuessCharset - adds HTTP charset by guessing file's encoding


Apache-GuessCharset documentation Contained in the Apache-GuessCharset distribution.

Index


Code Index:

NAME

Top

Apache::GuessCharset - adds HTTP charset by guessing file's encoding

SYNOPSIS

Top

  SetHandler perl-script
  PerlFixupHandler +Apache::GuessCharset

  # how many bytes to read for guessing (default 512)
  PerlSetVar GuessCharsetBufferSize 1024

  # list of encoding suspects
  PerlSetVar GuessCharsetSuspects euc-jp
  PerlAddVar GuessCharsetSuspects shiftjis
  PerlAddVar GuessCharsetSuspects 7bit-jis

DESCRIPTION

Top

Apache::GuessCharset is an Apache fix-up handler which adds HTTP charset attribute by automaticaly guessing text files' encodings via Encode::Guess.

CONFIGURATION

Top

This module uses following configuration variables.

GuessCharsetSuspects

a list of encodings for Encode::Guess to check. See Encode::Guess for details.

GuessCharsetBufferSize

specifies how many bytes for this module to read from source file, to properly guess encodings. default is 512.

AUTHOR

Top

Tatsuhiko Miyagawa <miyagawa@bulknews.net>

This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself.

SEE ALSO

Top

Encode::Guess, Apache::File


Apache-GuessCharset documentation Contained in the Apache-GuessCharset distribution.

package Apache::GuessCharset;

use strict;
our $VERSION = 0.03;
our $DEBUG   = 0;

use Apache::Constants;
use Apache::File;
use Encode::Guess;
use I18N::Charset;

# generated from http://www.iana.org/assignments/character-sets
# '/Name: (\S*)/ and $name = $1; /Alias: (\S*).*preferred MIME/ and print qq("$name" => "$1",)'
our %Prefered_MIME = (
    "ANSI_X3.4-1968" => "US-ASCII",
    "ISO_8859-1:1987" => "ISO-8859-1",
    "ISO_8859-2:1987" => "ISO-8859-2",
    "ISO_8859-3:1988" => "ISO-8859-3",
    "ISO_8859-4:1988" => "ISO-8859-4",
    "ISO_8859-6:1987" => "ISO-8859-6",
    "ISO_8859-6-E" => "ISO-8859-6-E",
    "ISO_8859-6-I" => "ISO-8859-6-I",
    "ISO_8859-7:1987" => "ISO-8859-7",
    "ISO_8859-8:1988" => "ISO-8859-8",
    "ISO_8859-8-E" => "ISO-8859-8-E",
    "ISO_8859-8-I" => "ISO-8859-8-I",
    "ISO_8859-5:1988" => "ISO-8859-5",
    "ISO_8859-9:1989" => "ISO-8859-9",
    "Extended_UNIX_Code_Packed_Format_for_Japanese" => "EUC-JP",
);

sub handler {
    my $r = shift;
    return DECLINED if
	! $r->is_main                  or
	$r->content_type !~ m@^text/@  or
	$r->content_type =~ /charset=/ or
	! -e $r->finfo                 or
	-d _                           or
	!(my $chunk = read_chunk($r));

    my @suspects = $r->dir_config->get('GuessCharsetSuspects');
    my $enc  = guess_encoding($chunk, @suspects);
    unless (ref $enc) {
	warn "Couldn't guess encoding: $enc" if $DEBUG;
	return DECLINED;
    }

    my $iana    = iana_charset_name($enc->name);
    my $charset = lc($Prefered_MIME{$iana} || $iana); # lowercased
    warn "Guessed: $charset" if $DEBUG;
    $r->content_type($r->content_type . "; charset=$charset");
    return OK;
}

sub read_chunk {
    my $r  = shift;
    my $fh = Apache::File->new($r->filename) or return;
    my $buffer_size = $r->dir_config('GuessCharsetBufferSize') || 512;
    read $fh, my($chunk), $buffer_size;
    return $chunk;
}

1;
__END__