/usr/local/CPAN/DBD-Google/DBD/Google/parser.pm
package DBD::Google::parser;
# ----------------------------------------------------------------------
# This package needs to subclass SQL::Parser, in order that the
# functions defined can be used. WIth SQL::Parser 1.005, the
# SELECT_CLAUSE method needs to be overridden.
#
# Jeff Zucker tells me that SQL::Parser 1.006 is coming out
# soon, and that it will support more functions and such. There
# might need to be some logic in here to ensure that an incompatible
# version of SQL::Parser is not being used.
# ----------------------------------------------------------------------
use strict;
use base qw(SQL::Parser);
use vars qw($VERSION $FIELD_RE $FUNC_RE);
use Carp qw(carp);
use File::Spec::Functions qw(catfile);
use HTML::Entities qw(encode_entities);
use SQL::Parser;
use URI::Escape qw(uri_escape);
$VERSION = "2.00";
# Package-scoped variables
# These are not lexicals so that they can be used in tests
$FIELD_RE = '[a-zA-Z][a-zA-Z0-9_]';
$FUNC_RE = qr/$FIELD_RE*(?:::$FIELD_RE*)*(?:[-]>$FIELD_RE*)?/; # methods?
$FIELD_RE = qr/$FIELD_RE*/;
my @default_columns = sort qw( title url snippet summary
cachedSize directoryTitle
hostName directoryCategory
);
my %allowed_columns = map { $_ => 1 }
map { $_, lc $_, uc $_ }
@default_columns;
for my $dc (@default_columns) {
$dc =~ s/([A-Z])/_\l$1/g;
$allowed_columns{$dc} = 1;
}
# All functions are passed two items: the Net::Google::Search
# instanace and the text to be fiddled with.
my %functions = (
'default' => sub { $_[1] },
'uri_escape' => sub { uri_escape($_[1]) },
'html_escape' => sub { encode_entities($_[1]) },
'count' => \&count,
'html_strip' => \&striphtml,
);
$functions{''} = $functions{'default'};
# ----------------------------------------------------------------------
# new(@stuff)
#
# Override SQL::Parser's new method, but only so that default values
# can be used.
# ----------------------------------------------------------------------
sub new { return shift->SUPER::new("Google", @_) }
# ----------------------------------------------------------------------
# SELECT_CLAUSE($sql)
#
# Parses the SELECT portion of $sql, which contains only the pieces
# between SELECT and WHERE. Understands the following syntax:
#
# field
#
# field AS alias
#
# field AS "alias"
#
# function(field)
#
# function(field) AS alias
#
# function(field) AS "alias"
#
# package::function(field)
#
# package::function(field) AS alias
#
# package::function(field) AS "alias"
#
# package->method(field)
#
# package->method(field) AS alias
#
# package->method(field) AS "alias"
#
# ----------------------------------------------------------------------
sub SELECT_CLAUSE {
my ($self, $sql) = @_;
#warn "Got: \$sql => '$sql'\n";
my ($columns, $limit, @columns, @limit, $where, $parsed);
# SQL::Parser::clean_sql does funny things to strings
# that look like methods
$sql = $self->unclean_cleaned_sql($sql);
# Internal data structures, given shorter names
my $column_names = $self->{'struct'}->{'column_names'} = [ ];
my $ORG_NAME = $self->{'struct'}->{'ORG_NAME'} = { };
my $functions = $self->{'struct'}->{'column_functions'} = { };
my $aliases = $self->{'struct'}->{'column_aliases'} = { };
my $errstr = \$self->{'struct'}->{'errstr'};
# columns
while ($sql =~ /\G
# Field name, including possible function
(?:
($FUNC_RE\s*\([^)]+\)) # $1 => function
|
($FIELD_RE) # $2 => field name
| (\*) # $3 => '*'
)
# possible alias
(?:
\s+
[aA][sS]
\s+
(['"]?) # $4 => possibly quoted
\s*
($FIELD_RE) # $5 => alias (no spaces allowed!)
\s*
\4?
)?
\s*
,?
\s*
/xsmg) {
my $alias = $5 || "";
my $function = $1 || "";
#warn "\$function => '$function'\n\$alias => '$alias'\n";
if (defined $3) {
# SELECT * -> expanded to all column names
my $df = $functions{'default'};
for (@default_columns) {
my $uc_ = uc $_;
push @$column_names => $_ ;
$ORG_NAME->{ $uc_ } = $_ ;
$functions->{ $uc_ } = $df ;
$aliases->{ $uc_ } = $_ ;
}
}
elsif ($function) {
# SELECT foo(bar)
my $original = $function;
$original =~ /($FUNC_RE)\s*\((.*?)\)/;
# XXX $n here might contains arguments; needs to be
# passed to String::Shellquote to extract tokens
my ($f, $n) = ($1, $2);
$n =~ s/(^\s*|\s*$)//g;
$f = "" unless defined $f;
unless ($allowed_columns{$n}) {
$$errstr = "Unknown column name '$n'";
return;
}
# Possible cases include:
# 1. No function defined
# 2. Function defined that we know about
# 3. Function defined we don't know about
# 3a. Function/method to be loaded
# 3b. Error
if ($f) {
if (defined $functions{$f}) {
# Common case:
#
# SELECT html_strip(title) FROM google ...
#
# A pre-defined function.
$f = $functions{$f};
}
else {
# If a user specifies a function like:
#
# SELECT Digest::MD5::md5_hex(title) FROM google ...
#
# or:
#
# SELECT URI->new(URL) FROM google ...
#
if (my ($package, $type, $func) = $f =~ /(.*)(::|[-]>)(.*)/) {
eval "use $package;";
if ($@) {
$$errstr = $@;
return;
}
else {
if ($type eq '::') {
if (defined(my $g = \&{"$package\::$func"})) {
$f = sub { shift; &$g(@_) };
} else {
$$errstr = "Can't load $package\::$func";
}
}
elsif ($type eq '->') {
$f = sub { shift; $package->$func(@_) };
}
else {
$f = $functions{'default'};
}
}
}
else {
# Function that matches $FUNC_RE but doesn't contain
# :: or ->; might be a built-in, such as uc, lc,
# gethostbyname, unlink, or even
# 'system("GET www.pr0n.com | mail ceo@my.company")'.
#
# This sucks, BTW.
$f = eval qq(sub { $f(\$_[1]) };);
}
}
}
else {
# No function:
#
# SELECT title FROM google ...
$f = $functions{'default'};
}
my $uc = uc $n;
push @$column_names, $n;
$ORG_NAME->{ $uc } = $n;
$functions->{ $uc } = $f;
$aliases->{ $uc } = $alias ? $alias : $n;
}
elsif (defined $2) {
my $lc = lc $2;
my $uc = uc $2;
if ($allowed_columns{$lc}) {
push @$column_names, $lc;
$ORG_NAME->{ $uc } = $lc;
$functions->{ $uc } = $functions{'default'};
$aliases->{ $uc } = $alias ? $alias : $lc;
} else {
$$errstr = "Unknown column name '$2'";
return;
}
}
}
1;
}
# ----------------------------------------------------------------------
# decompose()
#
# Returns a data structure, similar to the structure() method, that
# contains only what DBD::Google::db needs to pass to Net::Google.
# The data structure looks like:
#
# {
# QUERY => "query string",
# COLUMNS => [
# {
# FIELD => "Net::Google methodname",
# FUNCTION => sub { },
# ALIAS => "alias",
# },
# ],
# LIMIT => {
# limit => X,
# offset => Y,
# },
# }
# ----------------------------------------------------------------------
sub decompose {
my $self = shift;
my $struct = $self->structure;
my %data = ();
# Limit (use defaults of 0, 10)
$data{'LIMIT'} = $struct->{'limit_clause'} || { offset => 0, limit => 10 };
# Where
$data{'WHERE'} = $struct->{'where_clause'}->{'arg2'}->{'value'} || "";
$data{'WHERE'} =~ tr/'"//d;
# Columns
$data{'COLUMNS'} = [
map { { FIELD => $struct->{'ORG_NAME'}->{$_},
FUNCTION => $struct->{'column_functions'}->{$_},
ALIAS => $struct->{'column_aliases'}->{$_},
} } @{ $struct->{'column_names'} }
];
return wantarray ? %data : \%data;
}
# ----------------------------------------------------------------------
# unclean_cleaned_sql($sql)
#
# Undo some of the damage that SQL::Parser::clean_sql does to functions
# that look like Perl methods, e.g., Foo::Bar->new(title) gets turned
# into Foo::Bar- > new (title), which is no good.
# ----------------------------------------------------------------------
sub unclean_cleaned_sql {
my ($self, $sql) = @_;
$sql =~ s/\s*([-<>])\s*/$1/g;
return $sql;
}
# ----------------------------------------------------------------------
# striphtml($ng, $text)
#
# A function for stripping HTML. Very naive; it it becomes an
# issue, I'll include TCHRIST's striphtml.
# ----------------------------------------------------------------------
sub striphtml {
my $text = $_[1];
$text =~ s#</?[^>]*>##smg;
return $text;
}
# ----------------------------------------------------------------------
# count($ng)
#
# Returns the total number of results.
# ----------------------------------------------------------------------
sub count {
my $ngs = shift; # Net::Google::Search instance
my $res = $ngs->response;
return $res->estimateTotalResultsNumber;
}
1;
__END__
NOTES
Tim Buunce suggested count(*) as a way to get the total number of search results.
Data structure of SQL::Parser instance after parsing looks like:
'struct' => {
'org_table_names' => [
'google'
],
'column_names' => [
'*'
],
'table_alias' => {},
'command' => 'SELECT',
'table_names' => [
'GOOGLE'
],
'org_col_names' => [
'*'
]
},