/usr/local/CPAN/Data-Generate/Data/Generate.pm
#!/usr/bin/perl -w
################################################################################
# package Data::Generate
# Description: returns an SQL-Data generator object
# Design: during parsing we create following data structure internally:
# 'value_term': ascii string
# 'value_column': array of possible alternative choices for the value term
# 'value_chain': a chain of value columns
# 'chain_list': the generator itself
# output data : output data is retrieved by subsequent concatenation
# of value terms in a value chain. If more than one value chains are defined,
# then, based on weigthing, each chain at turn will be "asked" to return an
# output value, until an array of the requested cardinality is filled.
#
################################################################################
package Data::Generate;
use 5.006;
use strict;
use warnings;
use Carp;
use Parse::RecDescent;
use Date::Parse;
use Date::DayOfWeek;
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK);
require Exporter;
our @ISA = qw(Exporter);
# Items to export into callers namespace by default. Note: do not export
# names by default without a very good reason. Use EXPORT_OK instead.
# Do not simply export all your public functions/methods/constants.
# This allows declaration use Data::Generate ':all';
our %EXPORT_TAGS = ( 'all' => [ qw(
parse
) ] );
our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
our @EXPORT = qw();
our $VERSION = '0.02';
$Data::Generate::Parser=undef;
$Data::Generate::current=undef;
$Data::Generate::ACTUAL_VALUE_COLUMN=undef;
$Data::Generate::VC_RANGE_REVERSE_FLAG=undef;
#-------------------------------------------------------------------------------
# Various constant definitions
#-------------------------------------------------------------------------------
$Data::Generate::vcol_type ={};
$Data::Generate::vcol_type->{year}->{lowlimit}=1970; # Unix 32 bit date
$Data::Generate::vcol_type->{year}->{highlimit}=2037; # Unix 32 bit date
$Data::Generate::vcol_type->{year}->{type}='year';
$Data::Generate::vcol_type->{month}->{lowlimit}=1;
$Data::Generate::vcol_type->{month}->{highlimit}=12;
$Data::Generate::vcol_type->{month}->{type}='month';
$Data::Generate::vcol_type->{day}->{lowlimit}=1;
$Data::Generate::vcol_type->{day}->{highlimit}=31;
$Data::Generate::vcol_type->{day}->{type}='day';
$Data::Generate::vcol_type->{hour}->{lowlimit}=0;
$Data::Generate::vcol_type->{hour}->{highlimit}=24;
$Data::Generate::vcol_type->{hour}->{type}='hour';
$Data::Generate::vcol_type->{minute}->{lowlimit}=0;
$Data::Generate::vcol_type->{minute}->{highlimit}=59;
$Data::Generate::vcol_type->{minute}->{type}='minute';
$Data::Generate::vcol_type->{second}->{lowlimit}=0;
$Data::Generate::vcol_type->{second}->{highlimit}=59;
$Data::Generate::vcol_type->{second}->{type}='second';
$Data::Generate::vcol_type->{fraction}->{type}='fraction';
$Data::Generate::vchain_type ={};
$Data::Generate::vchain_type->{DATE}->{type}='DATE';
$Data::Generate::vchain_type->{DATE}->{vcol_output_format}=
['%s',' %02d:','%02d:','%02d','.%s'];
# ['%04d','%02d','%02d',' %02d:','%02d:','%02d','.%s'];
$Data::Generate::vchain_type->{DATE}->{check_type}=sub {
no warnings "all";
my $input=shift;
(my $ss,my $mm, my $hh,my $day,my $month,my $year)= strptime($input);
return undef unless defined $year;
$year+=1900;
$month++;
my $precision=0;
$precision = $Data::Generate::current->{ct_precision}
if defined $Data::Generate::current->{ct_precision};
my $result=sprintf('%04d%02d%02d %02d:%02d:%02.'.$precision.'f',
$year, $month, $day,$hh,$mm,$ss);
return undef unless defined str2time($result);
return $result;
};
$Data::Generate::vchain_type->{DATE}->{output_format_fct}=sub {
my $input=shift;
return $input unless defined $Data::Generate::current->{ct_precision};
my $precision=$Data::Generate::current->{ct_precision};
my ( $date_string, $date_fraction) = ($input =~ /^(.+?)(\d{2}\.\d*)$/);
$date_fraction=sprintf('%02.'.$precision.'f',$date_fraction);
return $date_string.$date_fraction;
};
$Data::Generate::vchain_type->{DATE}->{subtype}->{'DATEWITHFRACTION'}
->{fraction_start_ix}=4;
$Data::Generate::vchain_type->{INTEGER}->{type}='INTEGER';
$Data::Generate::vchain_type->{INTEGER}->{check_type}=sub {
no warnings "all";
my $input=shift;
my $result=int($input);
return undef unless $result == $input;
return $result;
};
$Data::Generate::vchain_type->{FLOAT}->{output_format_fct}=sub {
my $input=shift;
$input =~ s/^\-0+\.0+$/0.0/;
$input =~ s/^\+//;
return eval($input);
};
$Data::Generate::vchain_type->{FLOAT}->{check_type}=sub {
# no warnings "all";
my $input=shift;
my $result=$input*1.0;
$input=eval($input);
$result=eval($result);
return undef unless $result == $input;
return $result;
};
$Data::Generate::vcol_type->{weekday}->{type}='weekday';
$Data::Generate::vcol_type->{weekday}->{term_list}=[qw{SUN MON TUE WED THU FRI SAT}];
################################################################################
# sub new
# Description:
# inital constructor for a list of value chains.
#
################################################################################
sub new
{
my ($class,$text) = @_;
my $self = {};
$self->{vchain_text} = $text;
$self->{vchain_length} = 0;
$self->{data_array} = [''];
$self->{vchain_array} = [];
$self->{vchain_hash} = {};
$self->{actual_vcol} = {};
bless $self, $class;
$self->reset_actual_vchain();
return $self;
}
################################################################################
# sub load_parser
# Description:
# create a Parse::RecDescent parser
# and load Data::Generate grammatics into.
#
################################################################################
sub load_parser
{
#------------------------------------------------------------------------------#
# START OF GRAMMATICS #
#------------------------------------------------------------------------------#
my $grammar = q {
start: varchar_type
| string_type
| date_type
| integer_type
| float_type
#------------------------------------------------------------------------------#
# STRING TYPE GRAMMATICS #
#------------------------------------------------------------------------------#
# different intialization, but for the rest see varchar type
string_type: ct_string vch_list
ct_string: /STRING/
{
$Data::Generate::current->{chain_type}='STRING';
}
#------------------------------------------------------------------------------#
# VARCHAR TYPE GRAMMATICS #
#------------------------------------------------------------------------------#
varchar_type: ct_varchar vch_list
ct_varchar: /(VC2|VC|VARCHAR2|VARCHAR)/ '(' /\d+/ ')'
{
$Data::Generate::current->{chain_type}='VARCHAR';
$Data::Generate::current->{ct_length}=$item[3];
}
vch_list: <leftop: value_chain /\|/ value_chain >
value_chain: value_col(s) vchain_weigth(?)
{
$Data::Generate::current->bind_actual_vchain();
1; }
vchain_weigth: /\(/ /\d+\.?\d*/ /\%\)/
{ $Data::Generate::current->{actual_vchain}->{weigth}=$item[2]; 1; }
value_col: vcr_integer vcol_card(?)
{
$Data::Generate::current->bind_actual_vcol();
1;
}
| vcol_range
| vcol_literal
| vcol_filelist
vcol_literal: vcol_lit_term vcol_card(?)
{
$Data::Generate::current->bind_actual_vcol();
1;
}
vcol_card: '{' /\d+/ '}'
{
$Data::Generate::current->{actual_vcol}->{quantifier}=$item[2];
1;
}
vcol_lit_term: /\'.+?\'/
{
$item[1] =~ /\'(.+?)\'/;
push(@{$Data::Generate::current->
{actual_vcol}->{value_term_list}},$1); 1;
}
vcol_range: vcr_start vcr_reverse(?) vcr_term(s) vcr_end vcol_card(?)
{
$Data::Generate::current->check_reverse_flag();
$Data::Generate::current->bind_actual_vcol();
1;}
vcr_start: /\[/
vcr_reverse: /\^/ { $Data::Generate::current->{actual_vcol}
->{reverse_flag}=1; }
vcr_term: /[^\s\]\[]/ '-' /[^\s\]\[]/
{
my @cmp = map(chr,
(
ord($item[1])..ord($item[3])
)
);
push(@{$Data::Generate::current->
{actual_vcol}->{value_term_list}},@cmp);
}
| '\\\\ '
{
push(@{$Data::Generate::current->
{actual_vcol}->{value_term_list}},' ');
}
| '\\\\' /./
{
push(@{$Data::Generate::current->
{actual_vcol}->{value_term_list}},$item[2]);
}
| /[^\]\[]/
{
push(@{$Data::Generate::current->
{actual_vcol}->{value_term_list}},$item[1]);
}
vcr_end: /\]/
vcr_integer: /\[/ /\d+/ '..' /\d+/ /\]/
{
warn "false integer order " if $item[4] < $item[2];
my @cmp = ($item[2]..$item[4]);
push(@{$Data::Generate::current->
{actual_vcol}->{value_term_list}},@cmp);
}
vcol_filelist: vcol_filelist_term vcol_card(?)
{
$Data::Generate::current->bind_actual_vcol();
1;
}
vcol_filelist_term: /\<\S+\>/
{
(my $file)= ($item[1] =~ /\<(\S+)\>/);
$Data::Generate::current->vcol_file_process($file);
1;
}
#------------------------------------------------------------------------------#
# INTEGER TYPE GRAMMATICS #
#------------------------------------------------------------------------------#
integer_type: ct_integer vch_int_list
ct_integer: /(INTEGER|INT)/ ct_int_length(?)
{
$Data::Generate::current->{chain_type}='INTEGER';
$Data::Generate::current->{ct_length}=9 # max integer value
unless (exists $Data::Generate::current->{ct_length});
if ($Data::Generate::current->{ct_length}>9)
{
warn " maximal integer length is 9 \n".
"Current Value: $Data::Generate::current->{ct_length} is too high"
.",will use length 9.";
$Data::Generate::current->{ct_length}=9;
}
}
ct_int_length: '(' /\d+/ ')'
{
$Data::Generate::current->{ct_length}=$item[2];
}
vch_int_list: <leftop: vch_int /\|/ vch_int >
vch_int: vchi_sign(?) vcol_int(s) vchain_weigth(?)
{
$Data::Generate::current->bind_actual_vchain();
1; }
vchi_sign:
/\+\/\-/
{
$Data::Generate::current->{actual_vchain}->{sign}->{'+'}++;
$Data::Generate::current->{actual_vchain}->{sign}->{'-'}++;
1; }
| /[+-]/
{
$Data::Generate::current->{actual_vchain}->{sign}->{$item[1]}++;
1; }
vcol_int: vcint_range
| vcint_literal
| vcol_filelist
vcint_range: /\[/ <leftop: vcint_term /\,/ vcint_term > /\]/ vcint_card(?)
{
$Data::Generate::current->bind_actual_vcol();
1;}
vcint_term: /\d+/ '-' /\d+/
{
my @cmp = (($item[1]+0)..($item[3]+0));
push(@{$Data::Generate::current->
{actual_vcol}->{value_term_list}},@cmp);
}
| vcint_lit_term
vcint_literal: vcint_lit_term vcint_card(?)
{
$Data::Generate::current->bind_actual_vcol();
1;
}
vcint_lit_term: /\d+/
{
push(@{$Data::Generate::current->
{actual_vcol}->{value_term_list}},($item[1]+0));
}
vcint_card: '{' /\d+/ '}'
{
$Data::Generate::current->{actual_vcol}->{quantifier}=$item[2];
1;
}
#------------------------------------------------------------------------------#
# FLOAT TYPE GRAMMATICS #
#------------------------------------------------------------------------------#
float_type: ct_float vch_float_list
ct_float: /FLOAT/ '(' /\d+/ ')'
{
$Data::Generate::current->{chain_type}='FLOAT';
$Data::Generate::current->{ct_length}=$item[3];
}
vch_float_list: <leftop: vch_float /\|/ vch_float >
vch_float: vchfloat_filelist
| vcol_float_int_part vcol_float_fraction vcol_float_exponent(?) vchain_weigth(?)
{
$Data::Generate::current->{actual_vchain}
->{chain_subtype}='FLOATTOTAL';
$Data::Generate::current->bind_actual_vchain();
1; }
vchfloat_filelist: /\<\S+\>/
{
$Data::Generate::current->{actual_vchain}
->{chain_subtype}='FLOATLIST';
(my $file)= ($item[1] =~ /\<(\S+)\>/);
$Data::Generate::current->vcol_file_process($file);
$Data::Generate::current->bind_actual_vcol();
$Data::Generate::current->bind_actual_vchain();
1;
}
vcol_float_int_part: vchi_sign(?) vcol_int(s)
{
$Data::Generate::current->{actual_vchain}
->{chain_subtype}='FLOATINTPART';
$Data::Generate::current->bind_actual_vchain();
1; }
vcol_float_exponent: 'E' vcfloat_exp_sign(?) vcfloat_exp_term
{
$Data::Generate::current->{actual_vchain}
->{chain_subtype}='FLOATEXP';
$Data::Generate::current->bind_actual_vchain();
1; }
vcfloat_exp_sign: /[+-]/
{
$Data::Generate::current->{actual_vchain}->{sign}->{$item[1]}++;
1; }
vcfloat_exp_term: vcfloatexp_lit_term
{
$Data::Generate::current->bind_actual_vcol();
1;
}
vcfloatexp_lit_term: /\d+/
{
push(@{$Data::Generate::current->
{actual_vcol}->{value_term_list}},($item[1]+0));
1;
}
vcol_float_fraction: '.' vcol_fraction
{
$Data::Generate::current->{actual_vchain}
->{chain_subtype}='FLOATFRACTION';
$Data::Generate::current->bind_actual_vchain();
1;
}
#------------------------------------------------------------------------------#
# DATE TYPE GRAMMATICS #
#------------------------------------------------------------------------------#
date_type: ct_date ct_date_precision(?) vch_date_list
ct_date: /(DT|DATE)/
{
$Data::Generate::current->{chain_type}='DATE';
$Data::Generate::current->{ct_length}=17;
}
ct_date_precision: '(' /\d+/ ')'
{
$Data::Generate::current->{ct_precision}=$item[2];
if ($Data::Generate::current->{ct_precision}>14)
{
warn " maximal precision for fraction of seconds is 14 \n".
"Current Value: $Data::Generate::current->{ct_precision} is too high"
.",will use precision 14.";
$Data::Generate::current->{ct_precision}=14;
}
$Data::Generate::current->{ct_length}+=
$Data::Generate::current->{ct_precision}+1; # +1 because of dot sign
}
vch_date_list: <leftop: vch_date /\|/ vch_date >
vch_date: vcol_year vcol_month vcol_day vcol_time(?) vchain_weigth(?)
{
$Data::Generate::current->bind_actual_vchain();
1; }
| vchdate_filelist
vchdate_filelist: /\<\S+\>/
{
(my $file)= ($item[1] =~ /\<(\S+)\>/);
$Data::Generate::current->vcol_file_process($file);
$Data::Generate::current->bind_actual_vcol();
$Data::Generate::current->bind_actual_vchain();
1;
}
vcol_time: vcol_hour ':' vcol_min ':' vcol_sec vcol_date_fraction(?)
vcol_year: vcdate_range
{ $Data::Generate::current->bind_vcol_range('year'); 1;}
| vcdate_literal
{ $Data::Generate::current->bind_vcol_literal('year'); 1;}
vcol_month: vcmonth_range
{ $Data::Generate::current->bind_vcol_range('month'); 1;}
| vcmonth_literal
{
my $litval=shift(@{$Data::Generate::current->{actual_vcol}->{month_literal_values}});
$Data::Generate::current->{actual_vcol}->{literal_value}=$litval;
$Data::Generate::current->bind_vcol_literal('month'); 1;}
vcol_day: vcday_range
{ $Data::Generate::current->bind_vcol_range('day'); 1;}
| vcdate_literal
{ $Data::Generate::current->bind_vcol_literal('day'); 1;}
vcol_hour: vcdate_range
{ $Data::Generate::current->bind_vcol_range('hour'); 1;}
| vcdate_literal
{ $Data::Generate::current->bind_vcol_literal('hour'); 1;}
vcol_min: vcdate_range
{ $Data::Generate::current->bind_vcol_range('minute'); 1;}
| vcdate_literal
{ $Data::Generate::current->bind_vcol_literal('minute'); 1;}
vcol_sec: vcdate_range
{ $Data::Generate::current->bind_vcol_range('sec'); 1;}
| vcdate_literal
{ $Data::Generate::current->bind_vcol_literal('sec'); 1;}
vcol_date_fraction: '.' vcol_fraction
{
$Data::Generate::current->{actual_vchain}
->{chain_subtype}='DATEWITHFRACTION';
1;
}
vcdate_literal: /\d+/
{
$Data::Generate::current->{actual_vcol}->{literal_value}=$item[1];
1;
}
vcdate_range: /\[/ <leftop: vcdate_term /\,/ vcdate_term > /\]/
vcdate_term: /\d+/ '-' /\d+/
{ $Data::Generate::current->add_term_range($item[1],$item[3]);1; }
| /\d+/
{
push(@{$Data::Generate::current->{actual_vcol}->{value_term_list}},
$item[1]); 1;
}
vcday_range: /\[/ <leftop: vcday_term /\,/ vcday_term > /\]/
vcday_term: vcdate_term
| <leftop: vcday_literal /\-/ vcday_literal >
{
my $low =shift(@{$Data::Generate::current->{actual_vcol}->{weekday_index_values}});
my $high =shift(@{$Data::Generate::current->{actual_vcol}->{weekday_index_values}});
push(@{$Data::Generate::current->{actual_vcol}->{weekday_term_list}},
$low) unless defined $high;
$Data::Generate::current-> add_weekday_term_range($low,$high)
if defined $high;
1;
}
vcday_literal: /[a-zA-Z]+/
{
my @week=@{$Data::Generate::vcol_type->{weekday}->{term_list}};
my $ix=-1;
foreach my $wday_ix (0..$#week)
{
$ix=$wday_ix if $item[1] =~ /^$week[$wday_ix]/i;
}
die "cant process day term $item[1] " if $ix==-1;
push(@{$Data::Generate::current->{actual_vcol}->{weekday_index_values}}
,$ix);
1;
}
vcmonth_range: /\[/ <leftop: vcmonth_term /\,/ vcmonth_term > /\]/
vcmonth_term: <leftop: vcmonth_literal /\-/ vcmonth_literal >
{
my $low =shift(@{$Data::Generate::current->{actual_vcol}->{month_literal_values}});
my $high =shift(@{$Data::Generate::current->{actual_vcol}->{month_literal_values}});
push(@{$Data::Generate::current->{actual_vcol}->{value_term_list}},
$low) unless defined $high;
$Data::Generate::current->add_term_range($low,$high)
if defined $high;
1;
}
vcmonth_literal: /(\d+|[a-zA-Z]+)/
{
my $month=undef;
if ($item[1] =~ /\d+/)
{
$month =$item[1];
}
else
{
(undef,undef,undef,undef,$month,undef,undef) = Date::Parse::strptime($item[1].' 01');
die "Month $item[2] invalid " unless defined $month;
++$month;
}
push(@{$Data::Generate::current->{actual_vcol}->{month_literal_values}}
,$month);
1;
}
#------------------------------------------------------------------------------#
# FRACTION SUBTYPE GRAMMATICS #
# (RELEVANT FOR DATE AND FLOAT) #
#------------------------------------------------------------------------------#
vcol_fraction: vcol_fract(s)
vcol_fract: vcfract_range
| vcfract_literal
vcfract_range: /\[/ <leftop: vcfract_term /\,/ vcfract_term > /\]/ vcfract_card(?)
{
$Data::Generate::current->bind_actual_vcol();
1;}
vcfract_term: /\d+/ '-' /\d+/
{
my @cmp = (($item[1]+0)..($item[3]+0));
push(@{$Data::Generate::current->
{actual_vcol}->{value_term_list}},@cmp);
}
| vcfract_lit_term
vcfract_literal: vcfract_lit_term vcfract_card(?)
{
$Data::Generate::current->bind_actual_vcol();
1;
}
vcfract_lit_term: /\d+/
{
push(@{$Data::Generate::current->
{actual_vcol}->{value_term_list}},($item[1]+0));
}
vcfract_card: '{' /\d+/ '}'
{
$Data::Generate::current->{actual_vcol}->{quantifier}=$item[2];
1;
}
};
#------------------------------------------------------------------------------#
# END OF GRAMMATICS #
#------------------------------------------------------------------------------#
my $parser = Parse::RecDescent->new($grammar);
defined $parser or carp "couldn't load parser";
return $parser;
}
################################################################################
# Description: helper function
################################################################################
sub check_reverse_flag
{
my $self =shift;
return unless exists $self->{actual_vcol}->{reverse_flag};
$self->{actual_vcol}->{value_term_list}=
$self->get_value_column_reverse($self->{actual_vcol}->{value_term_list});
delete $self->{actual_vcol}->{reverse_flag};
}
################################################################################
# Description: helper function
################################################################################
sub check_range_order ($$)
{
my $min =shift;
my $max =shift;
if ($min >$max )
{
carp "false range order, $min > $max".
" will invert limits";
return [$max, $min];
}
return [$min, $max];
}
################################################################################
# sub vcol_file_process
# Description: read vcol_terms from file
#
################################################################################
sub vcol_file_process
{
my $self =shift;
my $file =shift;
open(VCOLFILE,$file) or carp "Couldnt open term file $file ";
my @cmp = (<VCOLFILE>);
close(VCOLFILE);
@cmp=('') if $#cmp==-1;
map(chomp($_),@cmp);
if (exists $Data::Generate::vchain_type->{$self->{chain_type}}
&& exists $Data::Generate::vchain_type->{$self->{chain_type}}->{check_type}
)
{
my @cmp2=();
foreach my $element (@cmp)
{
my $result=
&{$Data::Generate::vchain_type->{$self->{chain_type}}->{check_type}}
($element);
push(@cmp2,$result) if defined $result;
}
@cmp=@cmp2;
};
my $uniq={};
map($uniq->{$_}++,@cmp);
@cmp=(keys %$uniq);
push(@{$self->{actual_vcol}->{value_term_list}},@cmp);
}
################################################################################
# sub vcol_date_process
# Description: processing action for dates.
# At the end of each date production the three vcol date types year month day
# will be merged to a single one, so that date validity can be assessed,
# therefore instead of normally adding the date columns year and month,
# we keep them aside until the day column is processed.
#
################################################################################
sub vcol_date_process
{
my $self =shift;
if ($self->{actual_vcol}->{type} =~ /^(month|year)$/ )
{
my $type=$self->{actual_vcol}->{type};
$type.='_vcol';
$self->{$type} = $self->{actual_vcol};
return;
}
die "internal eror" if ($self->{actual_vcol}->{type} ne 'day' );
$self->{day_vcol} = $self->{actual_vcol};
$self->{actual_vcol}={};
my @value_term_list=();
my $weekdays={};
if (exists $self->{day_vcol}->{weekday_term_list})
{
foreach my $day_term (@{$self->{day_vcol}->{weekday_term_list}})
{
$weekdays->{$day_term}++
}
}
foreach my $year_term (@{$self->{year_vcol}->{value_term_list}})
{
foreach my $month_term (@{$self->{month_vcol}->{value_term_list}})
{
my $monthdays={};
foreach my $day_term (@{$self->{day_vcol}->{value_term_list}})
{
# convert 'char dates in numeric ones like '07'-> 7
# otherwise we cannot make unique value set
$day_term+=0;
$monthdays->{$day_term}++
}
my $first_month_weekday=dayofweek( 01,$month_term, $year_term );
foreach my $wkday_term (keys %{$weekdays})
{
my $day_term=$wkday_term-$first_month_weekday+1;
$day_term+=7 if $day_term<1;
while ($day_term<31)
{
$monthdays->{$day_term}++;
$day_term+=7;
}
}
foreach my $day_term (keys %{$monthdays})
{
my $date_term =
sprintf('%04d%02d%02d',$year_term, $month_term, $day_term);
push(@value_term_list,$date_term)
if defined str2time($date_term);
}
}
}
@value_term_list=sort(@value_term_list);
$self->{actual_vcol}->{value_term_list}=\@value_term_list;
$self->add_value_column($self->{actual_vcol}->{value_term_list});
delete $self->{year_vcol};
delete $self->{month_vcol};
delete $self->{day_vcol};
}
################################################################################
# sub vchain_date_fraction_process
# Description: reorganizes the internal vchain structure of date types with
# fraction values due to the possible presence of trailing zeros.
################################################################################
sub vchain_date_fraction_process
{
my $self =shift;
my $vchain_full=$self->{actual_vchain};
$self->reset_actual_vchain();
my $vchain_fraction={};
$vchain_fraction->{vcol_count}=$vchain_full->{vcol_count};
map($vchain_fraction->{vcol_hash}->{$_}->{value_column}=
$vchain_full->{vcol_hash}->{$_}->{value_column},
(0..$vchain_fraction->{vcol_count}));
my $fraction_start=
$Data::Generate::vchain_type->{DATE}->{subtype}->{'DATEWITHFRACTION'}
->{fraction_start_ix};
map_vchain_indexes($vchain_fraction,
sub { return undef if $_[0] <$fraction_start;
return $vchain_fraction->{vcol_count}-$_[0];
}
);
$vchain_fraction->{vcol_count}=$vchain_full->{vcol_count}-
$fraction_start;
my $vchain_data={};
$vchain_data->{weigth}=$vchain_full->{weigth};
my $vchain_weigth_list=$self->vchain_number_reprocess($vchain_fraction);
foreach my $vchain (@$vchain_weigth_list)
{
$vchain->{vcol_count}+=$fraction_start;
map_vchain_indexes($vchain,
sub { return $vchain->{vcol_count}-$_[0];
}
);
map($vchain->{vcol_hash}->{$_}->{value_column}=
$vchain_full->{vcol_hash}->{$_}->{value_column},
(0..$fraction_start-1));
}
# weigth has to be recalculated now.
calculate_vchain_list_weigth($vchain_weigth_list,$vchain_data->{weigth});
1;
}
################################################################################
# sub vchain_fraction_process
# Description: reorganizes the internal vchain structure of a fractional
# vchain part due to the possible presence of trailing zeros.
################################################################################
sub vchain_fraction_process
{
my $self =shift;
my $vchain_fraction =$self->{actual_vchain};
$self->reset_actual_vchain();
map_vchain_indexes($vchain_fraction,
sub {
return $vchain_fraction->{vcol_count}-$_[0];
}
);
my $vchain_data={};
$vchain_data->{weigth}=$vchain_fraction->{weigth};
my $vchain_weigth_list=$self->vchain_number_reprocess($vchain_fraction);
foreach my $vchain (@$vchain_weigth_list)
{
map_vchain_indexes($vchain,
sub { return $vchain->{vcol_count}-$_[0];
}
);
}
return $vchain_weigth_list;
1;
}
################################################################################
# sub merge_vchain_float_lists
# Description: merge int and float vchain lists together.(and add a '.' inbet.)
################################################################################
sub merge_vchain_float_lists
{
my $self =shift;
my $vchain_sign_list =shift;
my $vchain_integer_list =shift;
my $vchain_float_list =shift;
my $vchain_exp_list =shift;
my $vchain_merge_list =[];
my $vchain_zero =undef;
foreach my $vchain_integer (@$vchain_integer_list)
{
map_vchain_indexes($vchain_integer, sub { return 1+$_[0] ;});
$vchain_integer->{vcol_hash}->{0}->{value_column}=$vchain_sign_list;
$vchain_integer->{vcol_count}++;
}
if (@$vchain_exp_list ==0)
{
my $vchain_exp={};
$vchain_exp->{vcol_hash}->{0}->{value_column}=['0'];
$vchain_exp->{vcol_count}++;
push(@$vchain_exp_list,$vchain_exp);
}
foreach my $vchain_exp (@$vchain_exp_list)
{
map_vchain_indexes($vchain_exp, sub { return 1+$_[0] ;});
$vchain_exp->{vcol_hash}->{0}->{value_column}=['E'];
$vchain_exp->{vcol_count}++;
}
my $vchain_exp = $vchain_exp_list->[0];
foreach my $vchain_integer (@$vchain_integer_list)
{
foreach my $vchain_float (@$vchain_float_list)
{
foreach my $vchain_exp (@$vchain_exp_list)
{
my $vchain_merged={};
$vchain_merged->{vcol_count}=$vchain_integer->{vcol_count};
map($vchain_merged->{vcol_hash}->{$_}->{value_column}=
$vchain_integer->{vcol_hash}->{$_}->{value_column},
(0..$vchain_integer->{vcol_count}));
$vchain_merged->{vcol_count}++;
$vchain_merged->{vcol_hash}->{$vchain_merged->{vcol_count}}->{value_column}=['.'];
map($vchain_merged->{vcol_hash}->{$vchain_merged->{vcol_count}+1+$_}
->{value_column}=$vchain_float->{vcol_hash}->{$_}->{value_column},
(0..$vchain_float->{vcol_count}));
$vchain_merged->{vcol_count}+=$vchain_float->{vcol_count}+1;
# avoid double +/-0.0 , skip exp processing
if (($#{$vchain_merged->{vcol_hash}->{1}->{value_column}}==0)
&& ($vchain_merged->{vcol_hash}->{1}->{value_column}->[0]==0)
&& ($#{$vchain_merged->{vcol_hash}->{2}->{value_column}}==0)
&& ($vchain_merged->{vcol_hash}->{2}->{value_column}->[0] eq '.')
&& ($#{$vchain_merged->{vcol_hash}->{3}->{value_column}}==0)
&& ($vchain_merged->{vcol_hash}->{3}->{value_column}->[0]==0)
&& ($vchain_merged->{vcol_count}==3)
)
{
next if defined $vchain_zero;
$vchain_merged->{vcol_hash}->{0}->{value_column}=['+'];
$self->bind_vchain($vchain_merged);
push(@$vchain_merge_list,$vchain_merged);
$vchain_zero=$vchain_merged;
next;
}
map($vchain_merged->{vcol_hash}->{$vchain_merged->{vcol_count}+1+$_}
->{value_column}=$vchain_exp->{vcol_hash}->{$_}->{value_column},
(0..$vchain_exp->{vcol_count}));
$vchain_merged->{vcol_count}+=$vchain_exp->{vcol_count}+1;
$self->bind_vchain($vchain_merged);
push(@$vchain_merge_list,$vchain_merged);
}
}
}
return $vchain_merge_list;
1;
}
################################################################################
# sub vchain_date_fraction_process
# Description: reorganizes the internal vchain structure of date types with
# fraction values due to the possible presence of trailing zeros.
################################################################################
sub vchain_float_process
{
my $self =shift;
if ($self->{actual_vchain}->{chain_subtype} eq 'FLOATLIST' )
{
$self->bind_vchain($self->{actual_vchain});
$self->reset_actual_vchain();
return;
}
if ($self->{actual_vchain}->{chain_subtype} eq 'FLOATINTPART' )
{
$self->{FLOAT_CHAIN_START}=1+$#{$self->{vchain_array}};
$self->{FLOAT_CHAIN_SIGN}=[];
push (@{$self->{FLOAT_CHAIN_SIGN}},'+')
if (! exists $self->{actual_vchain}->{sign}
|| exists $self->{actual_vchain}->{sign}->{'+'} );
push (@{$self->{FLOAT_CHAIN_SIGN}},'-')
if ( exists $self->{actual_vchain}->{sign}
&& exists $self->{actual_vchain}->{sign}->{'-'} );
my $actual_vchain= $self->{actual_vchain};
$self->reset_actual_vchain();
$self->{FLOAT_INTEGER_PART}=$self->vchain_number_reprocess($actual_vchain);
return;
}
if ($self->{actual_vchain}->{chain_subtype} eq 'FLOATFRACTION' )
{
$self->{FLOAT_FRACTION_PART}=$self->vchain_fraction_process();
my $actual_vchain= $self->{actual_vchain};
$self->reset_actual_vchain();
return;
}
if ($self->{actual_vchain}->{chain_subtype} eq 'FLOATEXP' )
{
$self->{FLOAT_EXP_PART}=$self->vchain_integer_process();
return;
}
croak "Error in float parsing $self->{actual_vchain}->{chain_subtype} "
unless $self->{actual_vchain}->{chain_subtype} eq 'FLOATTOTAL';
# print "*********************".$self->{actual_vchain}->{weigth}."\n";
$self->{FLOAT_CHAIN_WEIGTH}=$self->{actual_vchain}->{weigth};
unless (exists $self->{FLOAT_EXP_PART})
{
$self->{actual_vchain}->{chain_subtype}= 'FLOATEXP';
push(@{$self->{actual_vcol}->{value_term_list}},0);
$self->bind_actual_vcol();
$self->{FLOAT_EXP_PART}=$self->vchain_integer_process();
$self->{zzzzFLOAT_EXP_PART}=$self->{FLOAT_EXP_PART};
}
foreach my $vchain_id ($self->{FLOAT_CHAIN_START}..$#{$self->{vchain_array}})
{
delete $self->{vchain_hash}->{$vchain_id};
pop(@{$self->{vchain_array}});
}
my $merge_list=$self->merge_vchain_float_lists($self->{FLOAT_CHAIN_SIGN},
$self->{FLOAT_INTEGER_PART},
$self->{FLOAT_FRACTION_PART},
$self->{FLOAT_EXP_PART});
calculate_vchain_list_weigth($merge_list,$self->{FLOAT_CHAIN_WEIGTH});
delete $self->{FLOAT_CHAIN_START};
delete $self->{FLOAT_CHAIN_SIGN};
delete $self->{FLOAT_CHAIN_WEIGTH};
delete $self->{FLOAT_INTEGER_PART};
delete $self->{FLOAT_FRACTION_PART};
delete $self->{FLOAT_EXP_PART};
1;
}
################################################################################
# sub vchain_integer_process
# Description: reorganizes the internal vchain structure of integer types.
# due to the possible presence of leading zeros.
################################################################################
# INT (9) +/- [3,0] [21,3,0] [4,0]
#
# + 0 0 4 -> converted to + 0 | + 3 0 4 | + 21 4| + 4
# - 3 21 0 | - 21 0 | - 3 0| -
# 3 | 3 | |
# | | |
#
# degr of freedom = 1 + 12 + 8 + 2 = 23
# -210','-214','-30','-300','-304','-3210','-3214','-330','-334','-34','-4','0','210','214','30','300',
# '304','3210','3214
sub vchain_integer_process
{
my $self =shift;
my $last_vchain=$self->{actual_vchain};
$self->reset_actual_vchain();
my $vchain_data={};
$vchain_data->{weigth}=$last_vchain->{weigth};
push (@{$vchain_data->{sign}},'+')
if (! exists $last_vchain->{sign} || exists $last_vchain->{sign}->{'+'} );
push (@{$vchain_data->{sign}},'-')
if ( exists $last_vchain->{sign} && exists $last_vchain->{sign}->{'-'} );
delete $last_vchain->{sign};
my $vchain_weigth_list=$self->vchain_number_reprocess($last_vchain);
foreach my $vchain (@$vchain_weigth_list)
{
next if $vchain->{vcol_count}==0
&& @{$vchain->{vcol_hash}->{0}->{value_column}}==1
&& $vchain->{vcol_hash}->{0}->{value_column}->[0]==0;
map_vchain_indexes($vchain,sub { return 1+$_[0];});
$vchain->{vcol_count}++;
@{$vchain->{vcol_hash}->{0}->{value_column}}=@{$vchain_data->{sign}};
}
# weigth has to be recalculated now.
calculate_vchain_list_weigth($vchain_weigth_list,$vchain_data->{weigth});
return $vchain_weigth_list;
}
################################################################################
# sub vchain_number_reprocess
# Description: reorganizes the internal vchain structure of numeric types.
# Due to the possible presence of leading or trailing zeros, we have to
# restructure the vcols in vchains to avoid duplicates (001, 01 problem).
# Other solutions are either too memory intensive (build the output values at
# vchain binding) or lead to incorrect cardinality calculation (eliminate
# duplicates at output data production);
################################################################################
# INT (9) +/- [3,0] [21,3,0] [4,0]
#
# + 0 0 4 -> converted to + 0 | + 3 0 4 | + 21 4| + 4
# - 3 21 0 | - 21 0 | - 3 0| -
# 3 | 3 | |
# | | |
#
# degr of freedom = 1 + 12 + 8 + 2 = 23
# -210','-214','-30','-300','-304','-3210','-3214','-330','-334','-34','-4','0','210','214','30','300',
# '304','3210','3214','330','334','34','4
sub vchain_number_reprocess
{
my $self =shift;
my $last_vchain =shift;
my $vcol_nonzero_list=[];
my $vcol_zero_list=[];
my $vchain_weigth_list=[];
while($last_vchain->{vcol_count}>=0)
{
my $vcol_list=
$last_vchain->{vcol_hash}->{0}->{value_column};
$vcol_nonzero_list=[];
$vcol_zero_list=[];
foreach my $vcol_value (@$vcol_list)
{
push (@$vcol_nonzero_list,$vcol_value) unless $vcol_value =~ /^0+$/;
push (@$vcol_zero_list,$vcol_value) if $vcol_value =~ /^0+$/;
}
if(@$vcol_nonzero_list >0)
{
$last_vchain->{vcol_hash}->{0}->{value_column}
=$vcol_nonzero_list;
$self->bind_vchain($last_vchain);
push(@$vchain_weigth_list,$self->{vchain_hash}
->{$#{$self->{vchain_array}}});
}
last unless(@$vcol_zero_list>0);
my $next_vchain={};
$next_vchain->{vcol_count}=$last_vchain->{vcol_count};
map($next_vchain->{vcol_hash}->{$_}->{value_column}=
$last_vchain->{vcol_hash}->{$_}->{value_column},
(0..$last_vchain->{vcol_count}));
map_vchain_indexes($next_vchain,sub {
return undef if $_[0]==0;
return $_[0]-1;
});
$next_vchain->{vcol_count}--;
$last_vchain=$next_vchain;
}
if (@$vcol_zero_list>0)
{
# add now 0 chain in place of +/-
$last_vchain->{vcol_hash}->{0}->{value_column}=[0];
$last_vchain->{vcol_count}++;
$self->bind_vchain($last_vchain);
push(@$vchain_weigth_list,$self->{vchain_hash}
->{$#{$self->{vchain_array}}});
}
return $vchain_weigth_list;
}
################################################################################
# Description: helper function. Calculate weigth for a group of vchains
################################################################################
sub calculate_vchain_list_weigth
{
my $vchain_list =shift;
my $weigth =shift;
my $card=
calculate_vchain_list_degrees_of_freedom($vchain_list);
map($_->{weigth}=$weigth,@$vchain_list);
map($_->{weigth}*=$_->{vchain_card},@$vchain_list);
map($_->{weigth}/=$card,@$vchain_list);
}
################################################################################
# Description: helper function.Change internal vcol indexes of a vchain
################################################################################
sub map_vchain_indexes
{
my $vchain =shift;
my $change_function =shift;
foreach my $index (0..$vchain->{vcol_count})
{
my $new_index=&$change_function($index);
next unless defined $new_index;
$vchain->{vcol_hash_tmp}->{$new_index}->{value_column}=
$vchain->{vcol_hash}->{$index}->{value_column};
}
$vchain->{vcol_hash}=$vchain->{vcol_hash_tmp};
delete $vchain->{vcol_hash_tmp};
}
################################################################################
# Description: helper function
################################################################################
sub check_input_limits
{
my $type =shift;
my $value =shift;
# no type defined, no ranges to check
return unless defined $type;
return unless exists $Data::Generate::vcol_type->{$type};
my $limit_check_hash=$Data::Generate::vcol_type->{$type};
if ((exists $limit_check_hash->{lowlimit}) &&
(defined $limit_check_hash->{lowlimit}))
{
croak " $limit_check_hash->{type} went out of range,".
" $value < $limit_check_hash->{lowlimit} "
if $value < $limit_check_hash->{lowlimit};
}
if ((exists $limit_check_hash->{highlimit}) &&
(defined $limit_check_hash->{highlimit}))
{
croak " $limit_check_hash->{type} went out of range,".
" $value > $limit_check_hash->{highlimit} "
if $value > $limit_check_hash->{highlimit};
}
}
################################################################################
# sub # vcol_add_term_range
# Description:
# add an expression (a..b) after parsing
################################################################################
sub add_weekday_term_range
{
my $self =shift;
my $min =shift;
my $max =shift;
my $act_vcol=$self->{actual_vcol};
if ($min>$max)
{
# index 6 is sunday
push(@{$self->{actual_vcol}->{weekday_term_list}},($min..6));
# index 0 is monday
push(@{$self->{actual_vcol}->{weekday_term_list}},(0..$max));
return;
}
push(@{$self->{actual_vcol}->{weekday_term_list}},($min..$max));
}
################################################################################
# sub # vcol_add_term_range
# Description:
# add an expression (a..b) after parsing
################################################################################
sub add_term_range
{
my $self =shift;
my $min =shift;
my $max =shift;
my $minmax=check_range_order($min,$max);
my $act_vcol=$self->{actual_vcol};
push(@{$self->{actual_vcol}->{value_term_list}},
($minmax->[0]..$minmax->[1]));
}
################################################################################
# sub # add_value_column_range
# Description:
# add an expression (a..b) after parsing
################################################################################
sub bind_vcol_range
{
my $self =shift;
my $type =shift;
my $act_vcol=$self->{actual_vcol};
foreach my $value (@{$self->{actual_vcol}->{value_term_list}})
{
check_input_limits($type,$value);
}
$act_vcol->{type}=$type;
$self->bind_actual_vcol();
}
################################################################################
# sub # add_value_column_range
# Description:
# add an expression (a..b) after parsing
################################################################################
sub bind_vcol_literal
{
my $self =shift;
my $type =shift;
my $act_vcol=$self->{actual_vcol};
check_input_limits($type,$self->{actual_vcol}->{literal_value});
$self->{actual_vcol}->{value_term_list}=
[$act_vcol->{literal_value}];
$act_vcol->{type}=$type;
$self->bind_actual_vcol();
}
################################################################################
# sub # sub set_actual_vchain_weigth
# Description:
# add weigth to actual value chain
################################################################################
sub reset_actual_vchain
{
my $self =shift;
$self->{actual_vchain} = {};
$self->{actual_vchain}->{vchain_length} = 0;
$self->{actual_vchain}->{weigth}=100;
}
################################################################################
# sub bind_actual_vcol
# Description: Postprocessing action.
# At the end of each value column production, add actual value column to the
# actual vchain. Afterwards reset actual_vcol to an empty hash
#
################################################################################
sub bind_actual_vcol
{
my $self =shift;
my $quantifier=1;
$quantifier=$self->{actual_vcol}->{quantifier}
if exists $self->{actual_vcol}->{quantifier};
if ((defined $self->{actual_vcol}->{type} ) &&
($self->{actual_vcol}->{type} =~ /^(day|month|year)$/ ))
{
$self->vcol_date_process();
}
elsif ((defined $self->{actual_vcol}->{type} ) &&
($self->{actual_vcol}->{type} eq 'sign' ))
{
$self->{sign_value_list}=$self->{actual_vcol}->{value_term_list};
$self->reset_actual_vcol();
}
else
{
$self->add_value_column($self->{actual_vcol}->{value_term_list})
foreach(1..$quantifier);
}
$self->{actual_vcol} = {};
$self->{actual_vcol}->{type} = undef;
}
sub reset_actual_vcol
{
my $self =shift;
$self->{actual_vcol} = {};
$self->{actual_vcol}->{type} = undef;
}
################################################################################
# Description: helper function. add vchain to generator object
################################################################################
sub bind_vchain
{
my $self =shift;
my $vchain =shift;
push(@{$self->{vchain_array}},$vchain);
$self->{vchain_hash}
->{$#{$self->{vchain_array}}}=$vchain;
}
################################################################################
# sub bind_actual_vchain
# Description: Postprocessing action.
# At the end of each chain production, add actual value chain to the chain list
# root structure, and afterwards reset actual_vchain to an empty hash
#
################################################################################
sub bind_actual_vchain
{
my $self =shift;
if ($self->{chain_type} eq 'INTEGER')
{
$self->vchain_integer_process();
return;
}
if ((exists $self->{actual_vchain}->{chain_subtype})
&& ($self->{actual_vchain}->{chain_subtype} eq 'DATEWITHFRACTION'))
{
$self->vchain_date_fraction_process();
return;
}
if ($self->{chain_type} eq 'FLOAT')
{
$self->vchain_float_process();
return;
}
$self->bind_vchain($self->{actual_vchain});
$self->reset_actual_vchain();
}
################################################################################
# sub add_value_column
# Description:
# add array of terms.
#
################################################################################
sub add_value_column
{
my $self = shift;
my $tmp_value_column = shift;
my $value_column = [];
my $vcol_maxlength=0;
my $ix=0;
my $unique={};
foreach my $value_term (@{$tmp_value_column})
{
my $vterm_length=length($value_term);
if (exists $self->{ct_length} && defined $self->{ct_length} &&
$self->{actual_vchain}->{vchain_length}+ $vterm_length>$self->{ct_length})
{
carp "Maximal length for type $self->{chain_type}($self->{ct_length}) "
."exceeded for \n$self->{vchain_text}\n"
."Element \'$value_term\' will be removed from output structures.\n"
."Please check your data creation rules\n";
next;
}
elsif ($unique->{$value_term}++>0)
{
carp "Duplicate entry \'$value_term\' found while building up internal structures.\n"
."Element \'$value_term\' will be removed from output structures.\n"
."Please check your data creation rules\n";
next;
}
else
{
push(@{$value_column},$value_term);
$vcol_maxlength =($vterm_length>$vcol_maxlength?$vterm_length:$vcol_maxlength);
$ix++;
}
};
$self->{actual_vchain}->{vchain_length}+=$vcol_maxlength;
if ($#{$value_column}==-1)
{
return 1;
}
if (exists $self->{actual_vchain}->{vcol_count})
{
$self->{actual_vchain}->{vcol_count}++
}
else
{$self->{actual_vchain}->{vcol_count}=0}
$self->{actual_vchain}->{vcol_hash}->{$self->{actual_vchain}->{vcol_count}}->{value_column} = $value_column;
}
################################################################################
# sub get_value_column_reverse
# Description: fill array in place with complementary ascii chars
#
################################################################################
sub get_value_column_reverse {
my $self = shift;
my $value_column = shift;
my @complement = map(chr,(0..255));
my $hash={};
$hash->{$_}++ foreach (@{$value_column});
$value_column=[];
foreach (@complement)
{
push(@$value_column,$_) unless $hash->{$_};
}
return $value_column;
}
################################################################################
# sub get_occupation_ratio
# Description:
# Based on input cardinality and degrees of freedom calculate
# the ratio of array elements to give / total number of elements
#
################################################################################
sub set_occupation_ratio
{
my $self = shift;
foreach my $actual_vchain (@{$self->{vchain_array}})
{
my $occupation_ratio = 0;
$occupation_ratio =
log($actual_vchain->{data_card}/$actual_vchain->{vchain_card})
/ ($actual_vchain->{vcol_count}+1);
$occupation_ratio =exp($occupation_ratio);
$actual_vchain->{vchain_occupation_ratio}= $occupation_ratio;
}
}
################################################################################
# sub calculate_occupation_levels
# Description:
# based on input cardinality calculate occupation levels.
#
################################################################################
sub calculate_occupation_levels
{
my $self = shift;
my $data_card = shift;
$self->check_input_card($data_card);
$self->set_occupation_ratio();
foreach my $actual_vchain (@{$self->{vchain_array}})
{
my $vchain_occupation_ratio =$actual_vchain->{vchain_occupation_ratio};
foreach (values %{$actual_vchain->{vcol_hash}})
{
my $vcol_degrees_of_freedom =$#{$_->{value_column}}+1;
if ($vchain_occupation_ratio ==1)
{ $_->{occupation_level} = $vcol_degrees_of_freedom }
else
{
$_->{occupation_level} =
int($vchain_occupation_ratio*$vcol_degrees_of_freedom)+1;
}
}
}
return ;
}
################################################################################
# sub get_degrees_of_freedom
# Description:
# get maximal cardinality
#
################################################################################
sub get_degrees_of_freedom
{
my $self = shift;
my $weigthed_card=undef;
foreach my $vchain_ref (@{$self->{vchain_array}})
{
confess " weigth undefined " unless defined $vchain_ref->{weigth} &&
defined $vchain_ref->{vchain_card};
if ($vchain_ref->{weigth} >0.0001)
{
$vchain_ref->{weigthed_card}=$vchain_ref->{vchain_card}/
$vchain_ref->{weigth};
}
else
{
$vchain_ref->{weigthed_card}=10000
}
$vchain_ref->{weigthed_card}=1 if $vchain_ref->{weigthed_card}<1;
$weigthed_card = $vchain_ref->{weigthed_card}
unless defined $weigthed_card;
$weigthed_card = $vchain_ref->{weigthed_card}
if $weigthed_card > $vchain_ref->{weigthed_card} ;
}
# workaround to handle integers numbers converted to float and back
if ( int($weigthed_card)+1-$weigthed_card <1e-9)
{
return int($weigthed_card)+1;
}
return int($weigthed_card);
}
################################################################################
# sub calculate_vchain_list_degrees_of_freedom
# Description:
# calculate maximal cardinality for a vchain
#
################################################################################
sub calculate_vchain_list_degrees_of_freedom
{
my $vchain_list = shift;
my $card=0;
foreach my $vchain_ref (@$vchain_list)
{
$vchain_ref->{vchain_card}=1;
foreach my $vcol_ref (values %{$vchain_ref->{vcol_hash}})
{ $vchain_ref->{vchain_card}*=$#{$vcol_ref->{value_column}}+1 }
$card+=$vchain_ref->{vchain_card};
}
return $card;
}
################################################################################
# sub calculate_degrees_of_freedom
# Description:
# calculate maximal cardinality of the generation rules
#
################################################################################
sub calculate_degrees_of_freedom
{
my $self = shift;
$self->{card}=
calculate_vchain_list_degrees_of_freedom($self->{vchain_array});
return $self->{card};
}
################################################################################
# sub calculate_weigth
# Description:
# normalize weigth so that total is 100%
#
################################################################################
sub calculate_weigth
{
my $self = shift;
my $weigth=0.0;
foreach my $vchain_ref (@{$self->{vchain_array}})
{
$weigth+= $vchain_ref->{weigth};
}
foreach my $vchain_ref (@{$self->{vchain_array}})
{
$vchain_ref->{weigth}/=$weigth
}
}
################################################################################
# sub check_input_card
# Description:
# ensures that degrees of freedom >= input_card
# generates a warning when input_card is bigger
#
################################################################################
sub check_input_card
{
my $self = shift;
my $data_card = shift;
if ($data_card > $self->{card})
{
carp "Input card ".$data_card." too big, maximal nr of ".
"values is $self->{card}.\nReturn only ".
$self->{card} ." values. \n";
$data_card=$self->{card};
}
foreach my $vchain_ref (@{$self->{vchain_array}})
{
$vchain_ref->{data_card}=$data_card;
$vchain_ref->{data_card}*=$vchain_ref->{weigth};
# $vchain_ref->{data_card}=int($vchain_ref->{data_card});
# $vchain_ref->{data_card}=1 if $vchain_ref->{data_card}==0;
if (int($vchain_ref->{data_card}) >$vchain_ref->{vchain_card})
{
carp "Either input card ".$data_card." too big or vchain weigth ".
"$vchain_ref->{weigth} too high.\nShould produce ".
$vchain_ref->{data_card}." values, can't produce more than ".
$vchain_ref->{vchain_card}." different values.\nReturn only ".
$vchain_ref->{vchain_card} ." values. \n";
$vchain_ref->{data_card}=$vchain_ref->{vchain_card};
}
}
}
################################################################################
# sub fisher_yates_shuffle
# Description: create a randomized array order. From Perl Cookbook
#
################################################################################
# fisher_yates_shuffle( \@array ) : generate a random permutation
# of @array in place
sub fisher_yates_shuffle {
my $array = shift;
my $i;
for ($i = @$array; --$i; ) {
my $j = int rand ($i+1);
next if $i == $j;
@$array[$i,$j] = @$array[$j,$i];
}
}
################################################################################
# sub is_valid
# Description: check if generator structure was built up successfully
#
################################################################################
sub is_valid {
my $self = shift;
return undef if @{$self->{vchain_array}} ==0;
1;
}
################################################################################
# sub get_data
# Description:
# get data
#
################################################################################
sub get_unique_data
{
my $self = shift;
my $data_card =shift;
$self->calculate_occupation_levels($data_card);
my $data =[];
my $chain_type=$self->{chain_type};
foreach my $actual_vchain (@{$self->{vchain_array}})
{
my $tmpdata =[''];
foreach my $value_column_index (0..$actual_vchain->{vcol_count})
{
my $value_column=$actual_vchain->{vcol_hash}->{$value_column_index};
my @tmp_value_column_copy=@{$value_column->{value_column}};
my @value_column_array =();
while(@value_column_array<$value_column->{occupation_level})
{
my $rnd_index=int(rand(@tmp_value_column_copy));
push(@value_column_array,splice(@tmp_value_column_copy,$rnd_index,1));
}
my $format=undef;
$format=
$Data::Generate::vchain_type->{$chain_type}->{vcol_output_format}
->[$value_column_index]
if ((exists $Data::Generate::vchain_type->{$chain_type})
&& (exists $Data::Generate::vchain_type
->{$chain_type}->{vcol_output_format}));
$tmpdata=vcol_chain($tmpdata, \@value_column_array,
$actual_vchain->{data_card},$format);
}
push(@$data,@$tmpdata);
}
# makes a random order
fisher_yates_shuffle($data);
# take away too much produced data
shift(@$data) while(@$data>$data_card);
map($_=&{$Data::Generate::vchain_type
->{$chain_type}->{output_format_fct}}($_),@$data)
if ((exists $Data::Generate::vchain_type->{$chain_type})
&& (exists $Data::Generate::vchain_type
->{$chain_type}->{output_format_fct}));
@$data = map(int($_),@$data) if $chain_type eq 'INTEGER';
@$data = sort(@$data);
my $uniq=[];
my $last='';
my $duplicates=0;
foreach my $item (@$data) {
if ($last eq $item)
{
$duplicates++;
next;
}
push(@$uniq, $item);
$last=$item;
}
carp "$duplicates duplicates found while generating ouput values.\n"
."Check syntax of statements" if $duplicates>0;
return $uniq;
}
################################################################################
# sub vcol_chain
# Description:
# make a cross product of two value columns and concatenate the values.
# if type is with formatted output prepare values with a pipe inbetween.
#
################################################################################
sub vcol_chain
{
my @original=@{shift()};
my @added =@{shift()};
my $card=shift;
my $format=shift;
$format= "%s" unless defined $format;
my @composed =();
foreach my $ele (@added)
{
foreach my $e2 (@original)
{
push(@composed,$e2.sprintf($format,$ele));
next unless defined $card;
return \@composed if(@composed>=$card);
}
}
return \@composed;
};
################################################################################
# sub parse
# Description:
# parse given text.
# return either an error or a Data::Generate object
#
################################################################################
sub parse($)
{
my ($text) = @_;
# check that parser is up and running
$Data::Generate::Parser=load_parser()
unless (defined $Data::Generate::Parser);
# create a new generator and set it as global for parse routines
$Data::Generate::ACTUAL_VALUE_COLUMN=undef;
$Data::Generate::VC_RANGE_REVERSE_FLAG=undef;
$Data::Generate::current= Data::Generate->new($text);
$Data::Generate::Parser->start($text);
$Data::Generate::current->is_valid() or
croak "Error in parsing, invalid generator for $text";
$Data::Generate::current->calculate_weigth();
$Data::Generate::current->calculate_degrees_of_freedom();
return $Data::Generate::current;
}
1;
__END__