XML::DTD::ContentModel - Perl module representing an element content


XML-DTD documentation Contained in the XML-DTD distribution.

Index


Code Index:

NAME

Top

XML::DTD::ContentModel - Perl module representing an element content model in an XML DTD

SYNOPSIS

Top

  use XML::DTD::ContentModel;

  my $cm = XML::DTD::ContentModel->new('(a,b*,(c|d)+)');
  print $cm->treestring;

DESCRIPTION

Top

XML::DTD::ContentModel is a Perl module representing an element content model in an XML DTD. The following methods are provided.

new
 my $cm = XML::DTD::ContentModel->new('(a,b*,(c|d)+)');

Construct a new XML::DTD::ContentModel object.

isa
 if (XML::DTD::ContentModel->isa($obj) {
 ...
 }

Test object type.

children
 my $objlst = $cm->children;

Return an array of child objects (subexpressions) which are also of type XML::DTD::ContentModel.

 my $objlst = $cm->children($children);

Set the array of child objects (subexpressions). Returns the new value.

element
 my $name = $cm->element;

Return the element name if the object has no subexpressions.

 my $name = $cm->element($eltname);

Set the element name. The element name should only be set if the object has no subexpressions. Returns the new value.

combineop
 my $op = $cm->combineop;

Return the combination operator (",", "|" or undef).

 my $op = $cm->combineop($combineop);

Set the combination operator (",", "|", or undef). Returns the new value.

occurop
 my $op = $cm->occurop;

Return the occurrence operator ("?", "+", "*", or undef).

 my $op = $cm->occurop($occurop);

Set the occurrence operator ("?", "+", "*", or undef). Returns the new value.

isatomic
 if ($cm->isatomic) {
 ...
 }

Determine whether the object is atomic (i.e. the model consists of a single element, ANY, EMPTY, or #PCDATA).

childnames
 my $nmlst = $cm->childnames;

Return an array of contained element names as an array reference.

string
 print $cm->string;

Return a string representation of the content model.

treestring
 print $cm->treestring;

Return a string representing the hierarchical structure of the model.

writexmlelts
 open(FH,'>file.xml');
 my $xo = new XML::Output({'fh' => *FH});
 $cm->writexmlelts($xo);

Write a component-specific part of the XML representation.

type
 my $typstr = $cm->type;

Determine the content specification type ('empty', 'any', 'mixed', or 'element').

dfa
 my $dfa = $cm->dfa;

Construct a Deterministic Finite Automaton to validate the content model (returns an XML::DTD::Automaton object). The approach is to use Thompson's construction of an NDFA from a regular expression, and then convert to Glushkov form via epsilon state elimination. Since SGML/XML content models are constrained to be unambiguous (or deterministic), the resulting automaton should be deterministic. For background details see:

* Anne Brüggemann-Klein and Derick Wood, The Validation of SGML Content Models, Mathematical and Computer Modelling, 25, 73-84, 1997. ftp://ftp.informatik.uni-freiburg.de/documents/papers/brueggem/podpJournal.ps * Dora Giammarresi, Jean-Luc Ponty, and Derick Wood, Glushkov and Thompson Constructions: A Synthesis. Tech. Report 98-17. Università Ca' Foscari di Venezia. http://www.mat.uniroma2.it/~giammarr/Research/Papers/gluth.ps.Z

SEE ALSO

Top

XML::DTD, XML::DTD::Element, XML::DTD::Automaton

AUTHOR

Top

Brendt Wohlberg <wohl@cpan.org>

COPYRIGHT AND LICENSE

Top

ACKNOWLEDGMENTS

Top

Peter Lamb <Peter.Lamb@csiro.au> fixed a bug in the _parse function, provided an improved implementation of _parenmatch, and modified accessor methods to allow setting of relevant values.


XML-DTD documentation Contained in the XML-DTD distribution.

package XML::DTD::ContentModel;

use XML::DTD::Automaton;
use XML::DTD::Error;

use 5.008;
use strict;
use warnings;
use Error qw(:try);

our @ISA = qw();

our $VERSION = '0.10';


# Constructor
sub new {
  my $proto = shift; # Class name or object reference
  my $cmstr = shift; # Content model string
  my $entmn = shift; # Reference to EntityManager object

  my $cls = ref($proto) || $proto;
  my $obj = ref($proto) && $proto;

  my $self;
  if ($obj) {
    # Called as a copy constructor
    $self = { %$obj };
    my $child;
    $self->{'chldlst'} = [];
    foreach $child ( @{$obj->{'chldlst'}} ) {
      push @{$self->{'chldlst'}}, $child->new;
    }
    bless $self, $cls;
  } else {
    # Called as the main constructor
    throw XML::DTD::Error("Constructor for XML::DTD::ContentModel called ".
			  "with undefined content model string")
      if (!defined $cmstr);
    $self = {
	     'chldlst' => [],    # List of child objects
	     'eltname' => undef, # Element name if leaf node of tree
	     'combnop' => undef, # Combine operator (choice or sequence)
	     'occurop' => undef  # Occurrence operator ('?', '*', or '+')
	    };
    bless $self, $cls;
    # Try to parse content model string
    try {
      $self->_parse($cls, $cmstr, $entmn);
    }
    # Catch any parse error exceptions
    catch XML::DTD::Error with {
      my $eo = shift;
      # If entity manager defined, and content model string contains
      # an entity reference, expand the entity reference and retry
      # parsing, otherwise just rethrow the exception. (This is an
      # ugly way of dealing with entity definitions not properly
      # handled by the parse method.)
      if (defined $entmn and $cmstr =~ /%[\w\.:\-_]+;/) {
	  $cmstr = $entmn->entitysubst($cmstr);
	  $self->_parse($cls, $cmstr, $entmn);
	} else {
	  $eo->throw();
	}
    };
  }
  return $self;
}


# Determine whether object is of this type
sub isa {
  my $cls = shift;
  my $r = shift;

  if (defined($r) && ref($r) eq $cls) {
    return 1;
  } else {
    return 0;
  }
}


# Return the list of child objects (subexpressions)
sub children {
  my $self = shift;

  if (@_) {
    my $chldlst = shift;
    $self->{'chldlst'} = $chldlst;
  }

  return $self->{'chldlst'}
}


# Return the element name if the object is the leaf node of the tree
sub element {
  my $self = shift;

  if (@_) {
    my $element = shift;
    $self->{'eltname'} = $element;
  }

  return $self->{'eltname'}
}


# Return the combination operator (i.e. "," or "|")
sub combineop {
  my $self = shift;

  if (@_) {
    my $combnop = shift;
    $self->{'combnop'} = $combnop;
  }

  return $self->{'combnop'};
}


# Return the occurrence operator (i.e. "?","+", or "*")
sub occurop {
  my $self = shift;

  if (@_) {
    my $occurop = shift;
    $self->{'occurop'} = $occurop;
  }

  return $self->{'occurop'};
}


# The object is atomic (i.e. the model consists of a single element,
# ANY, EMPTY, or #PCDATA)
sub isatomic {
  my $self = shift;

  return ((scalar @{$self->{'chldlst'}}) == 0);
}


# Return a list of contained elements
sub childnames {
  my $self = shift;
  my $names = shift;

  my $en;
  $names = {} if (!defined $names);
  if ($self->isatomic) {
    $en = $self->element;
    $names->{$en} = 1 if ($en ne 'ANY' and $en ne 'EMPTY' and
			  $en ne '#PCDATA');
  } else {
    my $child;
    foreach $child (@{$self->children}) {
      $child->childnames($names);
    }
  }
  return [sort keys %$names];
}


# Build a string representation of the content model
sub string {
  my $self = shift;

  my $type = $self->type;
  if ($self->isatomic and ($type eq 'mixed' or $type eq 'element')) {
    return "(" . $self->_string . ")";
  } else {
    return $self->_string;
  }
}


# Build a string representing the hierarchical structure of the model
sub treestring {
  my $self = shift;
  my $indent = shift;   # Indentation level
  my $showrefs = shift; # Flag selecting display of object references

  $indent = 0 if (!defined $indent);
  my $pre = '  ' x $indent;
  $pre .= "$self\t" if ($showrefs);
  my $cop = (defined $self->combineop)?$self->combineop:'';
  my $oop = (defined $self->occurop)?$self->occurop:'';
  my $cms = $self->string;
  my $str = sprintf("%-30s\t%s\t%s\n", $pre.$cms, $cop, $oop);
  my $child;
  foreach $child ( @{$self->{'chldlst'}} ) {
      $str .= $child->treestring($indent + 1, $showrefs);
  }
  return $str;
}


# Write component-specific part of the XML representation
sub writexmlelts {
  my $self = shift;
  my $xmlw = shift; # XML output object

  my $occur = (defined $self->{'occurop'} and $self->{'occurop'} ne '')?
    $self->{'occurop'}:undef;
  my $subop = (defined $self->{'combnop'} and $self->{'combnop'} ne '')?
    $self->{'combnop'}:undef;
  my $peref = (defined $self->{'peref'})?$self->{'peref'}:undef;
  if ($self->isatomic) {
    my $name = $self->element;
    my $label;
    if ($name eq '#PCDATA' or $name eq 'EMPTY' or $name eq 'ANY') {
      $label = 'type';
    } else {
      $label = 'name';
    }
    $xmlw->empty('child', {$label => $name, 'occur' => $occur,
			   'peref' => $peref});
  } else {
    $xmlw->open('children', {'occur' => $occur, 'subop' => $subop,
			     'peref' => $peref});
    my $c;
    foreach $c ( @{$self->{'chldlst'}} ) {
      $c->writexmlelts($xmlw);
    }
    $xmlw->close;
  }
}


# Determine the content specification type (empty, any, mixed, or element)
sub type {
  my $self = shift;

  if ($self->isatomic) {
    if ($self->element eq 'EMPTY') {
      return 'empty';
    } elsif ($self->element eq 'ANY') {
      return 'any';
    } elsif ($self->element eq '#PCDATA') {
      return 'mixed';
    } else {
      return 'element';
    }
  } else {
    my $oop = (defined $self->occurop)?$self->occurop:'';
    my $cop = (defined $self->combineop)?$self->combineop:'';
    if ($cop eq '|' and ($oop eq '' or $oop eq '*')) {
      my $chld = $self->children;
      my $c;
      foreach $c (@$chld) {
	return 'element' if (!$c->isatomic);
      }
      return 'element' if ($chld->[0]->element ne '#PCDATA');
      return 'mixed';
    } else {
      return 'element';
    }
  }
}


# Construct a DFA to validate the content model
sub dfa {
  my $self = shift;

  # The approach is to use Thompson's construction of an NDFA from a
  # regular expression, and then convert to Glushkov form via epsilon
  # state elimination. Since SGML/XML content models are constrained
  # to be unambiguous (or deterministic), the resulting automaton
  # should be deterministic. For background details see references
  # in documentation (below) for this method.

  # Construct an initial FSA object
  my $fsa = XML::DTD::Automaton->new;
  # Initial left index points to initial state
  my $ltn = 0;
  # Construct final state and set initial right index to its index
  my $rtn = $fsa->mkstate('Final', 1);
  # Call recursive FSA construction function
  $self->_buildfsa($fsa, $ltn, $rtn);
  # Eliminate epsilon transitions
  $fsa->epselim;
  # Remove unreachable states
  $fsa->rmunreach;
  # Ensure FSA is a DFA
  throw XML::DTD::Error("FSA for content model " . $self->string .
			" is not deterministic") if (!$fsa->isdeterministic);
  return $fsa;
}


# Parse content model string
#   Warning: This method is a mess, and should be completely rewritten
sub _parse {
  my $self = shift;
  my $class = shift; # Class identity for calling new method
  my $cmstr = shift; # Content model string
  my $entmn = shift; # Entity manager

  $cmstr =~ s/\s+//g; # Remove spaces
  ##print STDERR "PARSE: $cmstr\n";

  # Substitute entity values for references if entity is entire content model
  if (defined $entmn and
      ($cmstr =~ /^%([\w\.:\-_]+);$/ or
       $cmstr =~ /^\(%([\w\.:\-_]+);(\?|\*|\+)?\)$/ or
       $cmstr =~ /^\(%([\w\.:\-_]+);\)(\?|\*|\+)?$/ or
       $cmstr =~ /^\(\(%([\w\.:\-_]+);\)(\?|\*|\+)?\)$/)) {
    #my $paren = defined $2;
    #my $ocop = (defined $3)?$3:'';
    #$self->{'peref'} = $paren ? $2 : $1;
    $self->{'peref'} = $1;
    my $ocop = (defined $2)?$2:'';
    #my $paren = $cmstr =~ /\(/;
    my $paren = 1;
    my $entv = $entmn->pevalue($self->{'peref'});
    my $cmpnd = ($entv =~ /^[^\(]+\||\,[^\)]+$/);
    $cmstr = ($paren or $cmpnd)?"($entv$ocop)":"$entv$ocop" if (defined $entv);
    $cmstr =~ s/\s+//g; # Remove spaces
    ##print STDERR "SUBST: |$cmstr|$paren|$cmpnd|$entv|\n";
  }

  # Substitute entity values for references if content model consists
  # of a single entity with various configurations of parentheses and
  # occurence operators
  if (defined $entmn) {
    if ($cmstr =~ /^%([\w\.:\-_]+);$/) {
      $self->{'peref'} = $1;
      my $entv = $entmn->pevalue($self->{'peref'});
      $cmstr = "($entv)" if (defined $entv);
    } elsif ($cmstr =~ /^\(%([\w\.:\-_]+);(\?|\*|\+)?\)$/) {
      $self->{'peref'} = $1;
      my $ocop = (defined $2)?$2:'';
      my $entv = $entmn->pevalue($self->{'peref'});
      $cmstr = "($entv$ocop)" if (defined $entv);
    } elsif ($cmstr =~ /^\(%([\w\.:\-_]+);\)(\?|\*|\+)?$/) {
      $self->{'peref'} = $1;
      my $ocop = (defined $2)?$2:'';
      my $entv = $entmn->pevalue($self->{'peref'});
      $cmstr = "($entv)$ocop" if (defined $entv);
    } elsif ($cmstr =~ /^\(\(%([\w\.:\-_]+);\)(\?|\*|\+)?\)$/) {
      $self->{'peref'} = $1;
      my $ocop = (defined $2)?$2:'';
      my $entv = $entmn->pevalue($self->{'peref'});
      $cmstr = "(($entv)$ocop)" if (defined $entv);
    }

    $cmstr =~ s/\s+//g; # Remove spaces
  }

  # Temporary
  $self->{'cmstr'} = $cmstr;

  # Check whether model is a single element
  if ($cmstr =~ /^([A-Za-z_:][A-Za-z0-9-_:\.]*|#PCDATA)(\?|\+|\*)?$/ or
      $cmstr =~ /^\(([A-Za-z_:][A-Za-z0-9-_:\.]*|#PCDATA)(\?|\+|\*)?\)$/ or
      $cmstr =~ /^\(([A-Za-z_:][A-Za-z0-9-_:\.]*|#PCDATA)\)(\?|\+|\*)?$/) {
    # Just need to set element name and (optional) occurrence operator
    $self->{'eltname'} = $1;
    $self->{'occurop'} = $2;
    ##print STDERR "ATOMIC: |$cmstr|$1|".((defined $2)?$2:'')."\n";
    # Check whether model is a choice or sequence
  } elsif ($cmstr =~ /^\((.+)\)(\?|\+|\*)?$/) {
    # Should rewrite using _parenmatch in place of regex above
    # Set working string to content of parentheses and note occurrence operator
    $cmstr = $1;
    $self->{'occurop'} = $2;
    ##print STDERR "EXPR0: |$cmstr|\n";
    # Deal with first sequence/choice child expression
    my $expr;
    # Check whether string has no parentheses preceding the first
    # sequence or choice character
    if ($cmstr =~ /^([^\(\)\,\|]*)(\,|\|)/) { # Combine operator first
      $expr = $1;
      $self->{'combnop'} = $2;
      ##print STDERR "0CMBNOP: >>$2<< >>$cmstr<< >>$expr<<\n";
      $cmstr = $';
      throw XML::DTD::Error("Invalid content model: $cmstr", $self)
	  if ($expr eq '');
      push @{$self->{'chldlst'}}, $class->new($expr, $entmn);
    } else { # Parenthesis first
      my ($mat, $pst) = _parenmatch($cmstr);
      # Check whether parenthesis post-match consists of an optional
      # occurrence operator optionally followed by a combine operator
      ##print STDERR "PAREN: |$cmstr|$mat|$pst|\n";
      if ($pst =~ /^(\?|\+|\*)?(\,|\|)?/) {
	$expr = $mat.(defined($1)?$1:'');
	$self->{'combnop'} = $2;
	##print STDERR "1CMBNOP: >>$2<< >>$cmstr<< >>$expr<<\n";
	$cmstr = $';
	throw XML::DTD::Error("Invalid content model: $cmstr", $self)
	  if ($expr eq '');
	push @{$self->{'chldlst'}}, $class->new($expr, $entmn);
      } else {
	throw XML::DTD::Error("Invalid content model: $cmstr", $self);
	return;
      }
    }

    # Work through remaining sequence/choice child expressions
    while ($cmstr ne '') {
      ##print STDERR "EXPRn: |$cmstr|\n";
      # Check whether string has no parentheses preceding the first
      # sequence or choice character
      if ($cmstr =~ /^([^\(\)\,\|]*)(\,|\||$)/) { # Combine operator first
	$expr = $1;
	# Should check that combine op $2 is correct
	$cmstr = $';
	##print STDERR "2CMBNOP: >>$2<< >>$cmstr<< >>$expr<<\n";
	push @{$self->{'chldlst'}}, $class->new($expr, $entmn);
      } else { # Parenthesis first
	my ($mat, $pst) = _parenmatch($cmstr);
	# Check whether parenthesis post-match consists of an optional
	# occurrence operator followed by a combine operator
	if ($pst =~ /^(\?|\+|\*)?(\,|\||$)/) {
	  $expr = $mat.(defined($1)?$1:'');
	  # Should check that combine op $2 is correct
	  $cmstr = $';
	  ##print STDERR "3CMBNOP: >>$2<< >>$cmstr<< >>$expr<<\n";
	  push @{$self->{'chldlst'}}, $class->new($expr, $entmn);
	} else {
	  throw XML::DTD::Error("Invalid content model: $cmstr", $self);
	  return;
	}
      }
    }
  } else {
    throw XML::DTD::Error("Invalid content model: $cmstr", $self);
    return;
  }
}


# Find closing parenthesis matching first opening parenthesis in a
# string, and return a list consisting of the substrings including and
# after that closing parenthesis.
sub _parenmatch {
  my $str = shift;

  ##print STDERR "PARENMATCH: $str\n";
  my $level = 0;
  my $pos = 0;
  my $len = length $str;
  my $posl = index $str, '('; $posl = $len if ($posl < 0);
  my $posr = index $str, ')'; $posr = $len if ($posr < 0);
  if ($posl >= $len && $posr >= $len) {
    # String contains no parentheses
    return ('',$str);
  }
  do {
    if ($posl < $posr) {
      # A '(' is next
      $level++;
      $pos = $posl+1;
      $posl = index $str, '(', $pos;
      $posl = $len if ($posl < 0);
    } else { # $posl >= $posr
      # A ')' is next
      $level--;
      if ($level < 0) {
	throw XML::DTD::Error("Parenthesis matching error in string $str");
	return undef;
      }
      $pos = $posr+1;
      $posr = index $str, ')', $pos;
      $posr = $len if ($posr < 0);
    }
    # Drop out when the level returns to 0 or the string is exhausted
  } while ($level > 0 && $pos < $len);
  if ($level > 0) {
    throw XML::DTD::Error("Parenthesis matching error in string $str");
    return undef;
  }
  my $pre = substr $str, 0, $pos;
  my $pst = substr $str, $pos;
  return ($pre, $pst);
}


# Recursive part of function to build a string representation of the
# content model
sub _string {
  my $self = shift;

  my $str = '';
  if ($self->isatomic) {
    $str = $self->element;
  } else {
    my $strlst = [];
    my $child;
    foreach $child ( @{$self->{'chldlst'}} ) {
      push @$strlst, $child->_string;
    }
    my $cop = (defined $self->combineop)?$self->combineop:'';
    $str .= '(' . join($cop,@$strlst) . ')';
  }
  $str .= $self->occurop if (defined $self->occurop);
  return $str;
}


# Recursive part of function to build an FSA
sub _buildfsa {
  my $self = shift;
  my $fsa = shift; # FSA object
  my $ltn = shift; # Left (inbound) state index
  my $rtn = shift; # Right (outbound) state index

  # Content model expression is processed by building an FSA with
  # entry via state index $ltn and exit via state index $rtn. For each
  # subexpression, epsilon transitions are made to new entry and exit
  # states which are processed via a recursive call.

  if (defined $self->occurop and
      $self->occurop ne '') { # Need to deal with occurrence operator
    # Construct copy of this content model expression
    my $subexp = $self->new;
    # Remove occurrence operator from copy
    $subexp->{'occurop'} = undef;
    # Construct new left and right states labelled by the copied
    # content model expression
    my $ltn0 = $fsa->mkstate($subexp->string . '_lt');
    my $rtn0 = $fsa->mkstate($subexp->string . '_rt');
    if ($self->occurop eq '?') { # Occurrence operator is '?'
      # Construct relevant epsilon transitions
      $fsa->mktrans($ltn, $ltn0, '');
      $fsa->mktrans($rtn0, $rtn, '');
      $fsa->mktrans($ltn, $rtn, '');
    } elsif ($self->occurop eq '*') {  # Occurrence operator is '*'
      # Construct relevant epsilon transitions
      $fsa->mktrans($ltn, $ltn0, '');
      $fsa->mktrans($rtn0, $rtn, '');
      $fsa->mktrans($ltn, $rtn, '');
      $fsa->mktrans($rtn, $ltn0, '');
    } else {  # Occurrence operator is '+'
      # Construct relevant epsilon transitions
      $fsa->mktrans($ltn, $ltn0, '');
      $fsa->mktrans($rtn0, $rtn, '');
      $fsa->mktrans($rtn, $ltn0, '');
    }
    # Recursive call to deal with occurrence operator-free subexpression
    $subexp->_buildfsa($fsa, $ltn0, $rtn0);
  } else { # No occurrence operator
    if (defined $self->combineop and
	$self->combineop ne '') { # Need to deal with combine operator
      my ($chld, $ltn0, $rtn0);
      # Loop over each subexpression
      foreach $chld ( @{$self->{'chldlst'}} ) {
	# Construct new left and right states labelled by the current
	# content model subexpression
	$ltn0 = $fsa->mkstate($chld->string . '_lt');
	$rtn0 = $fsa->mkstate($chld->string . '_rt');
	if ($self->combineop eq ',') { # Combine operator is ','
	  # Construct epsilon transition from current left state to
	  # left state for current subexpression
	  $fsa->mktrans($ltn, $ltn0, '');
	  # Set current left state to right state for current subexpression
	  $ltn = $rtn0;
	} else { # Combine operator is '|'
	  # Construct epsilon transition from current left state to
	  # left state for current subexpression
	  $fsa->mktrans($ltn, $ltn0, '');
	  # Construct epsilon transition from current right state to
	  # right state for current subexpression
	  $fsa->mktrans($rtn0, $rtn, '');
	}
	# Recursive call to deal with current subexpression
	$chld->_buildfsa($fsa, $ltn0, $rtn0);
      }
      # If combine operator is ',', construct epsilon transition from
      # current right state to right state for current subexpression
      $fsa->mktrans($rtn0, $rtn, '') if ($self->combineop eq ',');
    } else { # No combine operator
      if ($self->isatomic) {
	# Expression is atomic, without occurrence operator
	$fsa->mktrans($ltn, $rtn, $self->element);
      } else {
	# Expression is not atomic
	if (scalar @{$self->children} == 1) {
	  # Expression is of the form ((a,b)); need to strip outer
	  # parentheses and recurse down a level
	  my $chld = $self->children->[0];
	  $chld->_buildfsa($fsa, $ltn, $rtn);
	} else {
	  # Should never reach here
	  throw XML::DTD::Error("Error converting content model ".
				$self->string." to an FSA", $self);
	}
      }
    }
  }
}


1;

__END__