# /usr/local/CPAN/Combine/Combine/Lucene.pm


package Combine::Lucene;
use XML::LibXML;
# update($idxpath, $xwi)
#
# Adds (or replaces) the record(s) described by $xwi in the Lucene index
# located at $idxpath.
#
#   $idxpath - path of the index; when nothing exists at that path the
#              indexer is asked to create a new index.
#   $xwi     - record object handed to Combine::XWI2XML::XWI2XML(), which
#              is expected to return one or more <documentRecord> elements.
#
# For every <documentRecord> any document already indexed under the same
# "id" is deleted first, then a new document is added with these fields:
#
#   id, modifiedDate, url (one per <url>), country   stored, not analyzed
#   title, canonicalDocument (one per <section>)     analyzed, not stored
#
# Dies if the generated XML cannot be parsed (XML::LibXML raises the error).
sub update
{
  my($idxpath,$xwi) = @_;

  # Build and parse the XML before the indexer is opened, so that a parse
  # failure does not leave an opened indexer without a matching close().
  my $xml =  '<?xml version="1.0" encoding="UTF-8"?>' . "\n";
     $xml .= "<documentCollection>\n";
     $xml .= Combine::XWI2XML::XWI2XML($xwi, 0, 0);
     $xml .= "</documentCollection>\n";

  my $parser = XML::LibXML->new();
  my $xmldoc = $parser->parse_string($xml);

  # Ask for index creation only when nothing exists at the index path yet.
  my $create = (-e $idxpath) ? 0 : 1;
  my $indexer = LuceneIndexer->new($idxpath,$create,1000000);

  # Small wrapper around the newField/addField pair used for every field.
  my $add_field = sub {
    my ($doc, $name, $value, $store, $index) = @_;
    $indexer->addField($doc, $indexer->newField($name, $value, $store, $index));
  };

  # Returns only the element children of a node (skips text, comments, ...).
  my $element_children = sub {
    my ($node) = @_;
    return grep { $_->nodeType == XML::LibXML::XML_ELEMENT_NODE() }
           $node->getChildNodes();
  };

  my @records = $xmldoc->getElementsByTagName('documentRecord');
  for my $rec (@records) {
    my $recid = $rec->getAttribute('id');

    # Delete the existing record so the index keeps one document per id.
    $indexer->deleteDocuments("id",$recid);

    my $doc = $indexer->newDocument();
    $add_field->($doc, "id", $recid, "YES", "NOT_ANALYZED_NO_NORMS");

    for my $child ($element_children->($rec)) {
      my $chname = $child->nodeName();

      if ($chname eq "modifiedDate") {
        $add_field->($doc, "modifiedDate", $child->textContent(),
                     "YES", "NOT_ANALYZED_NO_NORMS");
      }
      elsif ($chname eq "urls") {
        for my $subtag ($element_children->($child)) {
          next unless $subtag->nodeName eq 'url';
          # Use the text of this <url> element only; taking the text of
          # the enclosing <urls> would concatenate every URL of the record.
          $add_field->($doc, "url", $subtag->textContent(),
                       "YES", "NOT_ANALYZED_NO_NORMS");
        }
      }
      elsif ($chname eq "metaData") {
        for my $subtag ($element_children->($child)) {
          next unless $subtag->nodeName eq 'meta';
          my $mname = $subtag->getAttribute('name');
          next unless defined $mname && $mname eq 'title';
          $add_field->($doc, "title", $subtag->textContent(), "NO", "ANALYZED");
        }
      }
      elsif ($chname eq "canonicalDocument") {
        for my $subtag ($element_children->($child)) {
          next unless $subtag->nodeName eq 'section';
          $add_field->($doc, "canonicalDocument", $subtag->textContent(),
                       "NO", "ANALYZED");
        }
      }
      elsif ($chname eq "links") {
        # Links are recognised but not indexed.
      }
      elsif ($chname eq "property") {
        my $pname = $child->getAttribute('name');
        if (defined $pname && $pname eq 'country') {
          $add_field->($doc, "country", $child->textContent(),
                       "YES", "NOT_ANALYZED_NO_NORMS");
        }
      }
    }

    $indexer->addDocument($doc);
  }

  $indexer->close();
}
1;