DBIx::MyParsePP::Lexer - Pure-perl SQL lexer based on MySQL's source


DBIx-MyParsePP documentation Contained in the DBIx-MyParsePP distribution.

Index


Code Index:

NAME

Top

DBIx::MyParsePP::Lexer - Pure-perl SQL lexer based on MySQL's source

SYNOPSIS

Top

	use DBIx::MyParsePP::Lexer;
	use Data::Dumper;

	my $lexer = DBIx::MyParsePP::Lexer->new(
		string => $string
	);

	while ( my $token = $lexer->yylex() ) {

		print Dumper $token;

		last if $token->type() eq 'END_OF_INPUT';
		print $lexer->pos();
		print $lexer->line();

	}

DESCRIPTION

Top

DBIx::MyParsePP::Lexer is a translation of the lexer function from MySQL into pure Perl.

The goal of the translation was to closely follow the method of operation of the original lexer -- therefore performance is suffering at the expense of compatibility. For example, the original character set definitions are used, rather than determining which letter is uppercase or lowercase using a Perl regular expression.

CONSTRUCTOR

Top

The following arguments are available for the constructor. They are passed from DBIx::MyParsePP:

string is the string being parsed.

charset is the character set of the string. This is important when determining what is a number and what is a separator in the string. The default value is 'ascii', which is the only charset bundled with DBIx::MyParsePP by default. Please contact the author if you need support for other character sets.

version is the MySQL version to be emulated. This only affects the processing of /*!##### sql_clause */ comments, where ##### is the minimum version required to process sql_clause. The grammar itself is taken from MySQL 5.0.45, which is the default value of version.

sql_mode contains flags that influence the behavoir of the parser. Valid constants are MODE_PIPES_AS_CONCAT, MODE_ANSI_QUOTES, MODE_IGNORE_SPACE, MODE_NO_BACKSLASH_ESCAPES and MODE_HIGH_NOT_PRECEDENCE. The flags can be combined with the | operator. By default no flags are set.

client_capabilities is flag reflecting the capabilities of the client that issued the query. Currently the only flag accepted is CLIENT_MULTI_STATEMENTS, which controls whether several SQL statements can be parsed at once. By default no flags are set.

stmt_prepare_mode controls whether the statement being parsed is a prepared statement. The default is 0, however if this flag is set to 1, multiple SQL statements can not be parsed at once.

METHODS

Top

pos() and getPos() return the current character position as counted from the start of the string

getLine() and line() return the current line number.

getTokens() returns a reference to an array containing all tokens parsed so far.

LICENCE

Top

This file contains code derived from code Copyright (C) 2000-2006 MySQL AB

This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License in the file named LICENCE for more details.


DBIx-MyParsePP documentation Contained in the DBIx-MyParsePP distribution.

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; version 2 of the License.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

# Based on code Copyright (C) 2000-2006 MySQL AB

package DBIx::MyParsePP::Lexer;
require Exporter;
@ISA = qw(Exporter);
@EXPORT = qw(MODE_PIPES_AS_CONCAT MODE_ANSI_QUOTES MODE_IGNORE_SPACE MODE_NO_BACKSLASH_ESCAPES
		CLIENT_MULTI_STATEMENTS MODE_HIGH_NOT_PRECEDENCE);

use strict;

use DBIx::MyParsePP::Symbols;
use DBIx::MyParsePP::Charsets;
use DBIx::MyParsePP::Token;

use constant CTYPE_U	=> 01;		# Uppercase
use constant CTYPE_L	=> 02;		# Lowercase
use constant CTYPE_NMR 	=> 04;		# Numeral (digit)
use constant CTYPE_SPC	=> 010;		# Spacing character
use constant CTYPE_PNT	=> 020;		# Punctuation
use constant CTYPE_CTR	=> 040;		# Control character
use constant CTYPE_B	=> 0100;	# Blank
use constant CTYPE_X	=> 0200;	# heXadecimal digit

use constant LEXER_STRING		=> 0;
use constant LEXER_CHARSET		=> 1;
use constant LEXER_VERSION		=> 2;
use constant LEXER_SQL_MODE		=> 3;
use constant LEXER_OPTIONS		=> 4;
use constant LEXER_CLIENT_CAPABILITIES	=> 5;
use constant LEXER_STMT_PREPARE_MODE	=> 6;

use constant LEXER_PTR			=> 7;
use constant LEXER_TOK_START		=> 8;

use constant LEXER_TOKENS		=> 9;

use constant LEXER_YYLINENO		=> 10;
use constant LEXER_NEXT_STATE		=> 11;
use constant LEXER_IN_COMMENT		=> 12;
use constant LEXER_FOUND_SEMICOLON	=> 13;
use constant LEXER_SAFE_TO_CACHE_QUERY	=> 14;
use constant LEXER_SERVER_STATUS	=> 15;
use constant LEXER_CTYPE		=> 16;


use constant OPTION_FOUND_COMMENT	=> 1 << 15;
use constant CLIENT_MULTI_STATEMENTS	=> 1 << 16;
use constant SERVER_MORE_RESULTS_EXISTS	=> 8;
use constant NAMES_SEP_CHAR		=> '\377';


use constant MODE_PIPES_AS_CONCAT	=> 2;		# USE ME!
use constant MODE_ANSI_QUOTES		=> 4;
use constant MODE_IGNORE_SPACE		=> 8;
use constant MODE_MYSQL323		=> 65536;
use constant MODE_MYSQL40		=> MODE_MYSQL323 * 2;
use constant MODE_ANSI			=> MODE_MYSQL40 * 2;
use constant MODE_NO_AUTO_VALUE_ON_ZERO	=> MODE_ANSI * 2;
use constant MODE_NO_BACKSLASH_ESCAPES	=> MODE_NO_AUTO_VALUE_ON_ZERO * 2;
use constant MODE_STRICT_TRANS_TABLES	=> MODE_NO_BACKSLASH_ESCAPES * 2;
use constant MODE_STRICT_ALL_TABLES        	=> MODE_STRICT_TRANS_TABLES * 2;
use constant MODE_NO_ZERO_IN_DATE           	=> MODE_STRICT_ALL_TABLES * 2;
use constant MODE_NO_ZERO_DATE               	=> MODE_NO_ZERO_IN_DATE * 2;
use constant MODE_INVALID_DATES              	=> MODE_NO_ZERO_DATE * 2;
use constant MODE_ERROR_FOR_DIVISION_BY_ZERO 	=> MODE_INVALID_DATES * 2;
use constant MODE_TRADITIONAL                	=> MODE_ERROR_FOR_DIVISION_BY_ZERO * 2;
use constant MODE_NO_AUTO_CREATE_USER        	=> MODE_TRADITIONAL * 2;
use constant MODE_HIGH_NOT_PRECEDENCE        	=> MODE_NO_AUTO_CREATE_USER * 2;

my %state_maps;
my %ident_maps;

my %args = (
	string			=> LEXER_STRING,
	charset			=> LEXER_CHARSET,
	client_capabilities	=> LEXER_CLIENT_CAPABILITIES,
	stmt_prepare_mode	=> LEXER_STMT_PREPARE_MODE,
	sql_mode		=> LEXER_SQL_MODE,
	version			=> LEXER_VERSION
);

1;

sub new {
	my $class = shift;
	my $lexer = bless([], $class);

	my $max_arg = (scalar(@_) / 2) - 1;

	foreach my $i (0..$max_arg) {
		if (exists $args{$_[$i * 2]}) {
			$lexer->[$args{$_[$i * 2]}] = $_[$i * 2 + 1];
		} else {
			warn("Unkown argument '$_[$i * 2]' to DBIx::MyParsePP::Lexer->new()");
		}
        }

	$lexer->[LEXER_STRING]			= $lexer->[LEXER_STRING]."\0";
	$lexer->[LEXER_YYLINENO]		= 1;
	$lexer->[LEXER_TOK_START]		= 0;
	$lexer->[LEXER_PTR]			= 0;
	$lexer->[LEXER_NEXT_STATE]		= 'MY_LEX_START';

	$lexer->[LEXER_CLIENT_CAPABILITIES]	= CLIENT_MULTI_STATEMENTS if not defined $lexer->[LEXER_CLIENT_CAPABILITIES];
	$lexer->[LEXER_STMT_PREPARE_MODE]	= 0 if not defined $lexer->[LEXER_STMT_PREPARE_MODE];
	$lexer->[LEXER_SQL_MODE]		= 0 if not defined $lexer->[LEXER_SQL_MODE];	# CHECKME

	$lexer->[LEXER_VERSION]			= '50045' if not defined $lexer->[LEXER_VERSION];
	$lexer->[LEXER_CHARSET]			= 'ascii' if not defined $lexer->[LEXER_CHARSET]; # FIXME

	my $charset_uc = ucfirst($lexer->[LEXER_CHARSET]);
	eval('
				use DBIx::MyParsePP::'.$charset_uc.';
				$lexer->[LEXER_CTYPE] = $DBIx::MyParsePP::'.$charset_uc.'::ctype;
		');

	if ($@) {
		print STDERR "DBIx::MyParsePP::Lexer->new() failed: $@\n";
		return undef;
	}

	$lexer->[LEXER_TOKENS] 			= [];

	$lexer->init_state_maps($lexer->[LEXER_CHARSET]);

	return $lexer;
	
}

sub getLine {
	return $_[0]->[LEXER_YYLINENO];
}

sub line {
	return $_[0]->[LEXER_YYLINENO];
}

sub pos {
	return $_[0]->[LEXER_PTR];
}

sub getPos {
	return $_[0]->[LEXER_PTR];
}

sub getTokens {
	return $_[0]->[LEXER_TOKENS];
}

sub tokens {
	return $_[0]->[LEXER_TOKENS];
}

sub yyGet { return ord(substr($_[0]->[LEXER_STRING], $_[0]->[LEXER_PTR]++, 1)) };
sub yyGetLast { ord(substr($_[0]->[LEXER_STRING], $_[0]->[LEXER_PTR] - 1, 1)) };
sub yyPeek { ord(substr($_[0]->[LEXER_STRING], $_[0]->[LEXER_PTR], 1)) };
sub yyPeek2 { ord(substr($_[0]->[LEXER_STRING], $_[0]->[LEXER_PTR] + 1, 1)) };
sub yyUnget { $_[0]->[LEXER_PTR]-- };
sub yySkip { $_[0]->[LEXER_PTR]++ };
sub yyLength { ($_[0]->[LEXER_PTR] - $_[0]->[LEXER_TOK_START]) - 1 };

sub yylex {
	my $lexer = shift;
	my @res = $lexer->MYSQLlex();
	if (($res[0] eq '0') && ($res[1] eq '0')) {
		return (undef, '');	# EOF
	} else {
		my $token = DBIx::MyParsePP::Token->new(@res);
		push @{$lexer->[LEXER_TOKENS]}, $token;
		return ($res[0], $token);
	}
}

sub MYSQLlex {
	my $lexer = shift;

	my $string = $lexer->[LEXER_STRING];
	my $state_map = $state_maps{$lexer->[LEXER_CHARSET]};
	my $ident_map = $ident_maps{$lexer->[LEXER_CHARSET]};
	
	my $c = 0;
	my @token;
	my $result_state;
	my $state;

	$lexer->[LEXER_TOK_START] = $lexer->[LEXER_PTR];


	$state = $lexer->[LEXER_NEXT_STATE];
	$lexer->[LEXER_NEXT_STATE] = 'MY_LEX_OPERATOR_OR_IDENT';

	my $char = substr($string, $lexer->[LEXER_PTR], 1);

	for (;;) {
		if (
			($state eq 'MY_LEX_OPERATOR_OR_IDENT') ||
			($state eq 'MY_LEX_START')
		) {
			for ($c = $lexer->yyGet(); $state_map->[$c] eq 'MY_LEX_SKIP'; $c = $lexer->yyGet()) {
				$lexer->[LEXER_YYLINENO]++ if $c == ord("\n");
			}
			$lexer->[LEXER_TOK_START] = $lexer->[LEXER_PTR] - 1;
			$state = $state_map->[$c];
		}
		
		if ($state eq 'MY_LEX_ESCAPE') {
			return ("NULL_SYM","NULL") if $lexer->yyGet() == ord('N');
		}
	
		if (
			($state eq 'MY_LEX_ESCAPE') ||
			($state eq 'MY_LEX_CHAR') ||
			($state eq 'MY_LEX_SKIP')
		) {
			if (
				($c == ord('-')) &&
				($lexer->yyPeek() == ord('-')) &&
				(
					($lexer->my_isspace($lexer->yyPeek2())) ||
					($lexer->my_iscntrl($lexer->yyPeek2()))
				)
			) {
				$state = 'MY_LEX_COMMENT';
				next;
			}
			$lexer->[LEXER_PTR] = $lexer->[LEXER_TOK_START];
			my $lex_str = substr($string, $lexer->[LEXER_PTR], 1);
			$c = $lexer->yyGet();
			
			$lexer->[LEXER_NEXT_STATE] = 'MY_LEX_START' if $c != ord (')');

			if ($c == ord(',')) {
				$lexer->[LEXER_TOK_START] = $lexer->[LEXER_PTR];
			} elsif (($c == ord('?')) && (!$ident_map->[$lexer->yyPeek()])) {		# CHANGED
				return ("PARAM_MARKER","?");
			}
			return (chr($c), $lex_str);
		} elsif ($state eq 'MY_LEX_IDENT_OR_NCHAR') {
			if ($lexer->yyPeek() != ord("'")) {
				$state = 'MY_LEX_IDENT';
				next;
			}
			$lexer->[LEXER_TOK_START]++;
			$lexer->yySkip();
			my $lex_str;
			if (!defined ($lex_str = $lexer->get_text())) {
				$state = 'MY_LEX_CHAR';
				next;
			}
			return ('NCHAR_STRING',$lex_str);
		} elsif ($state eq 'MY_LEX_IDENT_OR_HEX') {
			if ($lexer->yyPeek() == ord("'")) {
				$state = 'MY_LEX_BIN_NUMBER';
				next;
			}
		} elsif ($state eq 'MY_LEX_IDENT_OR_BIN') {
			if ($lexer->yyPeek() == ord("'")) {
				$state = 'MY_LEX_BIN_NUMBER';
				next;
			}
		}

		if (
			($state eq 'MY_LEX_IDENT_OR_HEX') ||
			($state eq 'MY_LEX_IDENT_OR_BIN') ||
			($state eq 'MY_LEX_IDENT')
		) {
			my $start;
			## FIXME - multibyte

			for ($result_state = $c; $ident_map->[$c = $lexer->yyGet()]; $result_state |= $c) {};
			
			$result_state = $result_state & 0x80 ? 'IDENT_QUOTED' : 'IDENT';

			my $length = $lexer->[LEXER_PTR] - $lexer->[LEXER_TOK_START] - 1;
			$start = $lexer->[LEXER_PTR];

			if ($lexer->[LEXER_SQL_MODE] & MODE_IGNORE_SPACE) {
				for(; $state_map->[$c] eq 'MY_LEX_SKIP'; $c = $lexer->yyGet()) {};
			}

			if (
				($start == $lexer->[LEXER_PTR]) &&
				($c == ord('.')) &&
				($ident_map->[$lexer->yyPeek()])
			) {
				$lexer->[LEXER_NEXT_STATE] = 'MY_LEX_IDENT_SEP';
			} else {
				$lexer->yyUnget();
				if (@token = $lexer->find_keyword($length, $c == ord('('))) {
					$lexer->[LEXER_NEXT_STATE] = 'MY_LEX_START';
					return @token;
				}
				$lexer->yySkip();
			} 
			my $lex_str = $lexer->get_token($length);

			if (
				(substr($lex_str,0,1) eq '_') &&
				(exists $DBIx::MyParsePP::Charsets::charsets->{substr($lex_str,1)})
			) {
				return ('UNDERSCORE_CHARSET', substr($lex_str,1));
			}

			return($result_state, $lex_str);
		} elsif ($state eq 'MY_LEX_IDENT_SEP') {
			my $lex_str = substr($string, $lexer->[LEXER_PTR], 1);
			$c = $lexer->yyGet();
			$lexer->[LEXER_NEXT_STATE] = 'MY_LEX_IDENT_START';
			if (!$ident_map->[$lexer->yyPeek()]) {
				$lexer->[LEXER_NEXT_STATE] = 'MY_LEX_START';
			}
			return (chr($c), $lex_str);
		} elsif ($state eq 'MY_LEX_NUMBER_IDENT') {
			while ($lexer->my_isdigit($c = $lexer->yyGet())) {} ;
			if (!$ident_map->[$c]) {
				$state = 'MY_LEX_INT_OR_REAL';
				next;
			}
			if (($c == ord('e')) || ($c == ord('E'))) {
				if (
					($lexer->my_isdigit($lexer->yyPeek())) ||
					($c = $lexer->yyGet() == ord('+')) ||
					($c == ord('-'))
				) {
					if ($lexer->my_isdigit($lexer->yyPeek())) {
						$lexer->yySkip();
						while ($lexer->my_isdigit($lexer->yyGet())) {};
						my $lex_str = $lexer->get_token($lexer->yyLength());
						return ('FLOAT_NUM', $lex_str);
					}
				}
				$lexer->yyUnget();
			} elsif (
				($c == ord('x')) &&
				($lexer->[LEXER_PTR] - $lexer->[LEXER_TOK_START] == 2) &&
				(substr($string, $lexer->[LEXER_TOK_START], 1) eq '0')
			) {
				while($lexer->my_isxdigit($c = $lexer->yyGet())) {};
				if (($lexer->[LEXER_PTR] - $lexer->[LEXER_TOK_START]) >= 4 && (!$ident_map->[$c])) {
					my $lex_str = $lexer->get_token($lexer->yyLength());
					$lex_str = substr($lex_str, 2);
					return ('HEX_NUM', $lex_str);
				}
				$lexer->yyUnget();
			} elsif (
				($c == ord('b')) &&
				($lexer->[LEXER_PTR] - $lexer->[LEXER_TOK_START] == 2) &&
				(substr($string, $lexer->[LEXER_TOK_START], 1) eq '0')
			) {
				while($lexer->my_isxdigit($c = $lexer->yyGet())) {};
				if (($lexer->[LEXER_PTR] - $lexer->[LEXER_TOK_START]) >= 4 && (!$ident_map->[$c])) {
					my $lex_str = $lexer->get_token($lexer->yyLength());
					$lex_str = substr($lex_str, 2);
					return ('BIN_NUM', $lex_str);
				}
				$lexer->yyUnget();
			}
		}

		if ($state eq 'MY_LEX_IDENT_START') {
			$result_state = 'IDENT';
			# FIXME multibyte
			for ($result_state = 0; $ident_map->[$c = $lexer->yyGet()]; $result_state |= $c) {};
			$result_state = $result_state & 0x80 ? 'IDENT_QUOTED' : 'IDENT';

			if (($c == ord('.')) && ($ident_map->[$lexer->yyPeek()])) {
				$lexer->[LEXER_NEXT_STATE] = 'MY_LEX_IDENT_SEP';
			}

			my $lex_str = $lexer->get_token($lexer->yyLength());
			return($result_state, $lex_str);
		} elsif ($state eq 'MY_LEX_USER_VARIABLE_DELIMITER') {
			my $double_quotes = 0;
			my $quote_char = $c;
			$lexer->[LEXER_TOK_START] = $lexer->[LEXER_PTR];
			while ($c = $lexer->yyGet()) {
				my $var_length = $lexer->my_mbcharlen($c);
				if ($var_length == 1) {
					last if $c == ord(NAMES_SEP_CHAR);
					if ($c == $quote_char) {
						last if $lexer->yyPeek() != $quote_char;
						$c = $lexer->yyGet();
						$double_quotes++;
						next;
					}
				}
			}
			# MULTIBYTE!!

			my $lex_str;
				
			if ($double_quotes) {
				$lex_str = $lexer->get_quoted_token($lexer->yyLength() - $double_quotes, $quote_char);
			} else {
				$lex_str = $lexer->get_token($lexer->yyLength());
			}
		
			$lexer->yySkip() if $c == $quote_char;
			$lexer->[LEXER_NEXT_STATE] = 'MY_LEX_START';
			return ('IDENT_QUOTED', $lex_str);
		} elsif ($state eq 'MY_LEX_INT_OR_REAL') {
			if ($c != ord ('.')) {
				my $lex_str = $lexer->get_token($lexer->yyLength());
				return $lexer->int_token($lex_str);
			}
		}

		if (
			($state eq 'MY_LEX_INT_OR_REAL') ||
			($state eq 'MY_LEX_REAL')
		) {
			while ($lexer->my_isdigit($c = $lexer->yyGet())) {};
			if (
				($c == ord('e')) ||
				($c == ord('E'))
			) {
				$c = $lexer->yyGet();
				if (
					($c == ord('+')) ||
					($c == ord('-'))
				) {
					$c = $lexer->yyGet();
				}
			
				if (!$lexer->my_isdigit($c)) {
					$state = 'MY_LEX_CHAR';
					next;
				}

				while ($lexer->my_isdigit($lexer->yyGet())) {};
			
				my $lex_str = $lexer->get_token($lexer->yyLength());
				return ('FLOAT_NUM', $lex_str);
			}
			
			my $lex_str = $lexer->get_token($lexer->yyLength());
			return ('DECIMAL_NUM', $lex_str);
		} elsif ($state eq 'MY_LEX_HEX_NUMBER') {
			$lexer->yyGet();
			while ($lexer->my_isdigit($lexer->yyGet())) {};
			my $length = $lexer->[LEXER_PTR] - $lexer->[LEXER_TOK_START];
			if (!($length & 1) || ($c != ord ("'"))) {
				return ('ABORT_SYM','ABORT_SYM');
			}
			$lexer->yyGet();
			my $lex_str = $lexer->get_token($length);
			$lex_str = substr($lex_str, 2, length($lex_str) - 3);
			return ('HEX_NUM', $lex_str);
		} elsif ($state eq 'MY_LEX_BIN_NUMBER') {
			$lexer->yyGet();
			while (($c = $lexer->yyGet()) == ord('0') || $c == ord ('1')) {};
			my $length = $lexer->[LEXER_PTR] - $lexer->[LEXER_TOK_START];
			if ($c != ord("'")) {
				return ('ABORT_SYM','ABORT_SYM');
			}
			$lexer->yyGet();
			my $lex_str = $lexer->get_token($length);
			$lex_str = substr($lex_str, 2, length($lex_str) - 3);
			return ('BIN_NUM', $lex_str);
		} elsif ($state eq 'MY_LEX_CMP_OP') {
			if (
				($state_map->[$lexer->yyPeek()] eq 'MY_LEX_CMP_OP') ||
				($state_map->[$lexer->yyPeek()] eq 'MY_LEX_LONG_CMP_OP')
			) {
				$lexer->yySkip();
			}
			if (@token = $lexer->find_keyword($lexer->[LEXER_PTR] - $lexer->[LEXER_TOK_START], 0)) {
				$lexer->[LEXER_NEXT_STATE] = 'MY_LEX_START';
				return @token;				# ADDED
			}
			$state = 'MY_LEX_CHAR';
			next;
		} elsif ($state eq 'MY_LEX_LONG_CMP_OP') {
			if (
				($state_map->[$lexer->yyPeek()] eq 'MY_LEX_CMP_OP') ||
				($state_map->[$lexer->yyPeek()] eq 'MY_LEX_LONG_CMP_OP')
			) {
				$lexer->yySkip();
				if ($state_map->[$lexer->yyPeek()] eq 'MY_LEX_CMP_OP') {
					$lexer->yySkip();
				}
			}
			if (@token = $lexer->find_keyword($lexer->[LEXER_PTR] - $lexer->[LEXER_TOK_START], 0)) {
				$lexer->[LEXER_NEXT_STATE] = 'MY_LEX_START';
				return @token;
			}
			$state = 'MY_LEX_CHAR';
			next;
		} elsif ($state eq 'MY_LEX_BOOL') {
			if ($c != $lexer->yyPeek()) {
				$state = 'MY_LEX_CHAR';
				next;
			}
			$lexer->yySkip();
			@token = $lexer->find_keyword(2, 0);
			$lexer->[LEXER_NEXT_STATE] = 'MY_LEX_START';
			return @token;
		} elsif ($state eq 'MY_LEX_STRING_OR_DELIMITER') {
			if ($lexer->[LEXER_SQL_MODE] & MODE_ANSI_QUOTES) {
				$state = 'MY_LEX_USER_VARIABLE_DELIMITER';
				next;
			}
		}
		
		if (
			($state eq 'MY_LEX_STRING_OR_DELIMITER') ||
			($state eq 'MY_LEX_STRING')
		) {
			my $lex_str;
			if (!defined ($lex_str = $lexer->get_text())) {
				$state = 'MY_LEX_CHAR';
				next;
			}
			return ('TEXT_STRING', $lex_str);
		} elsif ($state eq 'MY_LEX_COMMENT') {
			$lexer->[LEXER_OPTIONS] |= OPTION_FOUND_COMMENT;
			while (($c = $lexer->yyGet()) != ord("\n") && $c) {};
			$lexer->yyUnget();
			$state = 'MY_LEX_START';
			next;
		} elsif ($state eq 'MY_LEX_LONG_COMMENT') {
			if ($lexer->yyPeek() != ord('*')) {
				$state = 'MY_LEX_CHAR';
				next;
			}
			$lexer->yySkip();
			$lexer->[LEXER_OPTIONS] |= OPTION_FOUND_COMMENT;
			if ($lexer->yyPeek() == ord('!')) {
				$lexer->yySkip();
				my $version = $lexer->[LEXER_VERSION];
				$state = 'MY_LEX_START';
				if ($lexer->my_isdigit($lexer->yyPeek())) {
					$version = substr($string, $lexer->[LEXER_PTR], 5);
					$lexer->[LEXER_PTR] += 5;	# FIXME for version numbers different from 5 characters
				}

				if ($version <= $lexer->[LEXER_VERSION]){
					$lexer->[LEXER_IN_COMMENT] = 1;
					next;
				}
			}

			while (
				($lexer->[LEXER_PTR] != length($string) - 1) && 
				(
					($c = $lexer->yyGet() != ord('*')) ||
					($lexer->yyPeek() != ord('/'))
				)
			) {
				$lexer->[LEXER_YYLINENO]++ if $c == ord("\n");
			}
			
			$lexer->yySkip() if $lexer->[LEXER_PTR] != length($string) - 1;

			$state = 'MY_LEX_START';
			next;
		} elsif ($state eq 'MY_LEX_END_LONG_COMMENT') {
			if ($lexer->[LEXER_IN_COMMENT] && $lexer->yyPeek() == ord('/')) {
				$lexer->yySkip();
				$lexer->[LEXER_IN_COMMENT] = 0;
				$state = 'MY_LEX_START';
			} else {
				$state = 'MY_LEX_CHAR';
			}
			next;
		} elsif ($state eq 'MY_LEX_SET_VAR') {
			if ($lexer->yyPeek() != ord ('=')) {
				$state = 'MY_LEX_CHAR';
				next;
			}
			$lexer->yySkip();
			return('SET_VAR','SET_VAR');
		} elsif ($state eq 'MY_LEX_SEMICOLON') {
			if ($lexer->yyPeek()) {
				if (
					($lexer->[LEXER_CLIENT_CAPABILITIES] & CLIENT_MULTI_STATEMENTS) && 
					(!$lexer->[LEXER_STMT_PREPARE_MODE])
				) {
					$lexer->[LEXER_SAFE_TO_CACHE_QUERY] = 0;
					$lexer->[LEXER_FOUND_SEMICOLON] = $lexer->[LEXER_PTR];
					$lexer->[LEXER_SERVER_STATUS] |= SERVER_MORE_RESULTS_EXISTS;
					$lexer->[LEXER_NEXT_STATE] = 'MY_LEX_END';
					return ('END_OF_INPUT','');
				}
				$state = 'MY_LEX_CHAR';
				next;
			}
		}
		
		if (
			($state eq 'MY_LEX_SEMICOLON') ||
			($state eq 'MY_LEX_EOL')
		) {
			if ($lexer->[LEXER_PTR] >= length($string) - 1) {
				$lexer->[LEXER_NEXT_STATE] = 'MY_LEX_END';
				return ('END_OF_INPUT','');
			}
			$state = 'MY_LEX_CHAR';
			next;
		} elsif ($state eq 'MY_LEX_END') {
			$lexer->[LEXER_NEXT_STATE] = 'MY_LEX_END';
			return (0,0);
		} elsif ($state eq 'MY_LEX_REAL_OR_POINT') {
			if ($lexer->my_isdigit($lexer->yyPeek())) {
				$state = 'MY_LEX_REAL';
			} else {
				$state = 'MY_LEX_IDENT_SEP';
				$lexer->yyUnget();
			}
			next;
		} elsif ($state eq 'MY_LEX_USER_END') {
			if (
				($state_map->[$lexer->yyPeek()] eq 'MY_LEX_STRING') ||
				($state_map->[$lexer->yyPeek()] eq 'MY_LEX_USER_VARIABLE_DELIMITER') ||
				($state_map->[$lexer->yyPeek()] eq 'MY_LEX_STRING_OR_DELIMITER')
			) {
				next;
			} elsif ($state_map->[$lexer->yyPeek()] eq 'MY_LEX_USER_END') {
				$lexer->[LEXER_NEXT_STATE] = 'MY_LEX_SYSTEM_VAR';
			} else {
				$lexer->[LEXER_NEXT_STATE] = 'MY_LEX_HOSTNAME';
			}
			my $lex_str = substr($string, $lexer->[LEXER_PTR], 1);
			return ('@', $lex_str);
		} elsif ($state eq 'MY_LEX_HOSTNAME') {
			for ($c = $lexer->yyGet(); $lexer->my_isalnum($c) || $c == ord('.') || $c == ord('_') || $c == ord('$'); $c = $lexer->yyGet()) {};
			my $lex_str = $lexer->get_token($lexer->yyLength());
			return ('LEX_HOSTNAME', $lex_str);
		} elsif ($state eq 'MY_LEX_SYSTEM_VAR') {
			my $lex_str = substr($string, $lexer->[LEXER_PTR], 1);
			$lexer->yySkip();
			$lexer->[LEXER_NEXT_STATE] = $state_map->[$lexer->yyPeek()] eq 'MY_LEX_USER_VARIABLE_DELIMITER' ? 'MY_LEX_OPERATOR_OR_IDENT' : 'MY_LEX_IDENT_OR_KEYWORD';
			return ('@', $lex_str);
		} elsif ($state eq 'MY_LEX_IDENT_OR_KEYWORD') {
			for ($result_state = 0; $ident_map->[$c = $lexer->yyGet()]; $result_state |= $c) {};
			$result_state = $result_state & 0x80 ? 'IDENT_QUOTED' : 'IDENT';

			$lexer->[LEXER_NEXT_STATE] = 'MY_LEX_IDENT_SEP' if $c == ord('.');
	
			my $length = ($lexer->[LEXER_PTR] - $lexer->[LEXER_TOK_START]) - 1;
			return ('ABORT_SYM','ABORT_SYM') if $length == 0;
			if (@token = $lexer->find_keyword($length, 0)) {
				$lexer->yyUnget();
				return @token;
			}
			my $lex_str = $lexer->get_token($length);
			return ($result_state, $lex_str);
		}
	}
}

sub init_state_maps {

	my $lexer = shift;

	return if exists $state_maps{$lexer->[LEXER_CHARSET]};

	my @state_map;
	my @ident_map;

	for (my $i = 0; $i < 256; $i++) {
		if ($lexer->my_isalpha($i)) {
			$state_map[$i] = 'MY_LEX_IDENT';
		} elsif ($lexer->my_isdigit($i)) {
			$state_map[$i] = 'MY_LEX_NUMBER_IDENT';
		# FIXME MULTI-BYTE
		} elsif ($lexer->my_isspace($i)) {
			$state_map[$i] = 'MY_LEX_SKIP';
		} else {
			$state_map[$i] = 'MY_LEX_CHAR';
		}
	}

	$state_map[ord('_')] = $state_map[ord('$')] = 'MY_LEX_IDENT';
	$state_map[ord("'")] = 'MY_LEX_STRING';
	$state_map[ord('.')] = 'MY_LEX_REAL_OR_POINT';

	$state_map[ord('>')] = $state_map[ord('=')] = $state_map[ord('!')] = 'MY_LEX_CMP_OP';
	$state_map[ord('<')] = 'MY_LEX_LONG_CMP_OP';
	$state_map[ord('&')] = $state_map[ord('|')] = 'MY_LEX_BOOL';
	$state_map[ord('#')] = 'MY_LEX_COMMENT';
	$state_map[ord(';')] = 'MY_LEX_SEMICOLON';
	$state_map[ord(':')] = 'MY_LEX_SET_VAR';
	$state_map[0] = 'MY_LEX_EOL';
	$state_map[ord("\\")] = 'MY_LEX_ESCAPE';
	$state_map[ord('/')] = 'MY_LEX_LONG_COMMENT';
	$state_map[ord('*')] = 'MY_LEX_END_LONG_COMMENT';
	$state_map[ord('@')] = 'MY_LEX_USER_END';
	$state_map[ord('`')] = 'MY_LEX_USER_VARIABLE_DELIMITER';
	$state_map[ord('"')] = 'MY_LEX_STRING_OR_DELIMITER';

	for (my $i=0; $i < 256 ; $i++) {
		$ident_map[$i] = ($state_map[$i] eq 'MY_LEX_IDENT') || ($state_map[$i] eq 'MY_LEX_NUMBER_IDENT');
	}

	$state_map[ord('x')] = $state_map[ord('X')] = 'MY_LEX_IDENT_OR_HEX';
	$state_map[ord('b')] = $state_map[ord('B')] = 'MY_LEX_IDENT_OR_BIN';
	$state_map[ord('n')] = $state_map[ord('N')] = 'MY_LEX_IDENT_OR_NCHAR';

	$state_maps{$lexer->[LEXER_CHARSET]} = \@state_map;
	$ident_maps{$lexer->[LEXER_CHARSET]} = \@ident_map;
}


sub my_mbcharlen { 1 };

sub my_isalpha { $_[0]->[LEXER_CTYPE]->[$_[1] + 1] & (CTYPE_U | CTYPE_L) }

sub my_isalnum { $_[0]->[LEXER_CTYPE]->[$_[1] + 1] & (CTYPE_U | CTYPE_L | CTYPE_NMR) }

sub my_isxdigit { $_[0]->[LEXER_CTYPE]->[$_[1] + 1] & CTYPE_X }

sub my_isdigit { $_[0]->[LEXER_CTYPE]->[$_[1] + 1] & CTYPE_NMR }

sub my_isspace { $_[0]->[LEXER_CTYPE]->[$_[1] + 1] & CTYPE_SPC }

sub my_iscntrl { $_[0]->[LEXER_CTYPE]->[$_[1] + 1] & CTYPE_CTR }

sub get_text {
	my $lexer = shift;
	my $string = $lexer->[LEXER_STRING];
	my $sep = $lexer->yyGetLast();
	my $found_escape = 0;
	while ($lexer->[LEXER_PTR] != length($lexer->[LEXER_STRING]) - 1) {
		my $c = $lexer->yyGet();
		if (
			($c == ord("\\")) &&
			(!($lexer->[LEXER_SQL_MODE] & MODE_NO_BACKSLASH_ESCAPES))
		) {
			$found_escape = 1;
			return undef if $lexer->[LEXER_PTR] == length($lexer->[LEXER_STRING]);
			$lexer->yySkip();
		} elsif ($c == $sep) {
			if ($c == $lexer->yyGet()) {
				$found_escape = 1;
				next;
			} else {				
				$lexer->yyUnget();
			}
			
			my ($str, $end, $start);

			$str = $lexer->[LEXER_TOK_START] + 1;
			$end = $lexer->[LEXER_PTR] - 1;

			my $to;

			if (!$found_escape) {
				my $yytoklen = $end - $str;	# CHANGED
				if ($yytoklen > 0) {
					return substr($lexer->[LEXER_STRING], $str, $yytoklen);
				} else {
					return '';
				}
			} else {
				my $new_str = '';		# ADDED
				for ($to = $start; $str != $end; $str++) {
					if (
						(!($lexer->[LEXER_SQL_MODE] & MODE_NO_BACKSLASH_ESCAPES)) &&
						(substr($string, $str, 1) eq "\\") &&
						($str + 1 != $end)
					) {
						my $prev_str = substr($string, ++$str, 1);
						if ($prev_str eq 'n') {
							substr($new_str, $to++, 1) = "\n";
							next;
						} elsif ($prev_str eq 't') {
							substr($new_str, $to++, 1) = "\t";
							next;
						} elsif ($prev_str eq 'r') {
							substr($new_str, $to++, 1) = "\r";
							next;
						} elsif ($prev_str eq 'b') {
							substr($new_str, $to++, 1) = "\b";
							next;
						} elsif ($prev_str eq '0') {
							substr($new_str, $to++, 1) = "\0";
							next;
						} elsif ($prev_str eq 'Z') {
							substr($new_str, $to++, 1) = "\032";
							next;
						} elsif (
							($prev_str eq '_') ||
							($prev_str eq '%')
						) {
							substr($new_str, $to++, 1) = "\\";
							substr($new_str, $to++, 1) = $prev_str;	# Added
						} else {
							substr($new_str, $to++, 1) = $prev_str;
						}
					} elsif (substr($string, $str, 1) eq $sep) {
						substr($new_str, $to++, 1) = substr($string, $str++, 1);
					} else {
						substr($new_str, $to++, 1) = substr($string, $str, 1);
					}
				}
				return $new_str;
			}
			return substr($string, $start, ($to - $start));
		}
	}
	return undef;
}

sub get_token {
	my ($lexer, $length) = @_;
	$lexer->yyUnget();
	return substr($lexer->[LEXER_STRING], $lexer->[LEXER_TOK_START], $length);
}

use constant LONG_STR		=> "2147483647";
use constant LONG_LEN 		=> 10;
use constant SIGNED_LONG_STR	=> "-2147483648";
use constant LONGLONG_STR	=> "9223372036854775807";
use constant LONGLONG_LEN	=> 19;
use constant SIGNED_LONGLONG_STR => "-9223372036854775808";
use constant SIGNED_LONGLONG_LEN => 19;
use constant UNSIGNED_LONGLONG_STR => "18446744073709551615";
use constant UNSIGNED_LONGLONG_LEN => 20;

sub int_token {
	my ($lexer, $token) = @_;
	
	if (length($token) < LONG_LEN) {
		return ("NUM", $token);
	}

	my $neg = 0;

	if (substr($token, 0, 1) eq '+') {
		$token = substr($token, 1);
	} elsif (substr($token, 0, 1) eq '-') {
		$token = substr($token, 1);
		$neg = 1;
	}

	while (
		(substr($token, 0, 1) eq '0') &&
		(length($token) > 0)
	) {
		$token = substr($token, 1);
	}

	if (length($token) < LONG_LEN) {
		return ("NUM", $token);
	}

	my ($smaller, $bigger);
	my $cmp;

	if ($neg) {
		if (length($token) == LONG_LEN) {
			$cmp = SIGNED_LONG_STR + 1;
			$smaller = 'NUM';
			$bigger = 'LONG_NUM';
		} elsif (length($token) < SIGNED_LONGLONG_LEN) {
			return ('LONG_NUM', $token);
		} elsif (length($token) > SIGNED_LONGLONG_LEN) {
			return ('DECIMAL_SYM', $token);
		} else {
			$cmp = SIGNED_LONGLONG_STR + 1;
			$smaller = 'LONG_NUM';
			$bigger = 'DECIMAL_NUM';
		}
	} else {
		if (length($token) == LONGLONG_LEN) {
			$cmp = LONG_STR;
			$smaller = 'NUM';
			$bigger = 'LONG_NUM';
		} elsif (length($token) < LONGLONG_LEN) {
			return('LONG_NUM', $token);
		} elsif (length($token) > LONGLONG_LEN) {
			if (length($token) > UNSIGNED_LONGLONG_LEN) {
				return ('DECIMAL_NUM', $token);
			}
			$cmp = UNSIGNED_LONGLONG_STR;
			$smaller = 'ULONGLONG_NUM';
			$bigger = 'DECIMAL_NUM';
		} else {
			$cmp = LONGLONG_STR;
			$smaller = 'LONG_NUM';
			$bigger = 'ULONGLONG_NUM';
		}
	}
		
	return $token > $cmp ? ($bigger, $token) : ($smaller, $token);
}

sub find_keyword {
	my ($lexer, $length, $function) = @_;
	my $keyword = substr($lexer->[LEXER_STRING], $lexer->[LEXER_TOK_START], $length);

	my $symbol;
	if ($function) {
		$symbol = $DBIx::MyParsePP::Symbols::functions->{uc($keyword)};
		$symbol = $DBIx::MyParsePP::Symbols::symbols->{uc($keyword)} if not defined $symbol;
	} else {
		$symbol = $DBIx::MyParsePP::Symbols::symbols->{uc($keyword)};
	}

	return () if not defined $symbol;
	
	if (
		($symbol eq 'NOT_SYM') &&
		($lexer->[LEXER_SQL_MODE] & MODE_HIGH_NOT_PRECEDENCE)
	) {
		$symbol = 'NOT2_SYM';
	}

	if (
		($symbol eq 'OR_OR_SYM') &&
		($lexer->[LEXER_SQL_MODE] & MODE_PIPES_AS_CONCAT)
	) {
		$symbol = 'OR2_SYM';
	}

	return ($symbol, $keyword);
}

1;


__END__