Kasago - A Perl source code indexer


Kasago documentation Contained in the Kasago distribution.

Index


Code Index:

NAME

Top

Kasago - A Perl source code indexer

SYNOPSIS

Top

  my $kasago = Kasago->new({ dbh => $dbh });
  $kasago->init; # this creates the tables for you

  # import/update a directory
  $kasago->import($source, $dir);
  # delete a directory
  $kasago->delete($source);

  my @sources = $kasago->sources;
  my @files   = $kasago->files($source);
  my @tokens  = $kasago->tokens($source, $file);

  # search for a token
  foreach my $token ($kasago->search('orange')){
    print $token->source . "/"
      . $token->file . "@"
      . $token->col . ","
      . $token->row . ": "
      . $token->line . "\n";
  }

  # search for a token, merging lines
  foreach my $hit ($kasago->search_merged($search)) {
    print $hit->source . "/"
      . $hit->file . "@"
      . $hit->row . ": "
      . $hit->line . "\n";
    foreach my $token (@{ $hit->tokens }) {
      print "  @" . $token->col . ": " . $token->value . "\n";
    }
  }  

  # search for tokens
  foreach my $token ($kasago->search_more($search)) {
    print $token->source . "/"
      . $token->file . "@"
      . $token->col . ","
      . $token->row . ": "
      . $token->line . "\n";
  }

  # searh for tokens, merging lines
  foreach my $hit ($kasago->search_more_merged($search)) {
    print $hit->source . "/"
      . $hit->file . "@"
      . $hit->row . ": "
      . $hit->line . "\n";
    foreach my $token (@{ $hit->tokens }) {
      print "  @" . $token->col . ": " . $token->value . "\n";
    }
  }

DESCRIPTION

Top

Kasago is a module for indexing Perl source code. You can index source trees, and then query the index for symbols, strings, and documentation.

Kasago uses the PPI module to parse Perl and stores the index in a PostegreSQL database. Thus you need to have DBD::Pg installed and a database available for Kasago.

Why is this called Kasago? Because that's the Japanese name for a beautiful fish.

METHODS

Top

new

This is the constructor. It takes a DBI database handle as a parameter. This must be a valid dababase handle for a PostgreSQL database, constructed along the lines of 'my $dbh = DBI->connect("DBI:Pg:dbname=kasago", "", "")':

  my $kasago = Kasago->new({ dbh => $dbh });

delete

This deletes a source from the index:

  $kasago->delete($source);

files

Given a source, returns a list of the files indexed in that source:

  my @files   = $kasago->files($source);

import

This recursively imports a directory into Kasago. If the source is already indexed, the index is updated. You pass a source name and the directory path:

  $kasago->import($source, $dir);

init

This created the tables needed by Kasago in the database. You only need run this once. If you run this after initialisation, it will delete the index.

  $kasago->init;

This searches the index for an individual token:

    foreach my $token ($kasago->search('orange')){
      print $token->source . "/"
        . $token->file . "@"
        . $token->col . ","
        . $token->row . ": "
        . $token->line . "\n";
    }

search_merged

This searches the index for an individual token, but merges multiple tokens on the same line together:

    foreach my $hit ($kasago->search_merged($search)) {
      print $hit->source . "/"
        . $hit->file . "@"
        . $hit->row . ": "
        . $hit->line . "\n";
      foreach my $token (@{ $hit->tokens }) {
        print "  @" . $token->col . ": " . $token->value . "\n";
      }
    }  

search_more

This searches the index for tokens. "orange" would return all hits for orange, "orange leon" would return all hits for both "orange" and "leon". "orange -leon" shows all the hits for "orange" but without files that contain "leon", "+orange +leon" returns hits in files that contain both "orange" and "leon":

  foreach my $token ($kasago->search_more($search)) {
    print $token->source . "/"
      . $token->file . "@"
      . $token->col . ","
      . $token->row . ": "
      . $token->line . "\n";
  }

search_more_merged

This searches the index for tokens as search_more, but merges multiple tokens on the same line together:

  foreach my $hit ($kasago->search_more_merged($search)) {
    print $hit->source . "/"
      . $hit->file . "@"
      . $hit->row . ": "
      . $hit->line . "\n";
    foreach my $token (@{ $hit->tokens }) {
      print "  @" . $token->col . ": " . $token->value . "\n";
    }
  }

sources

This returns a list of the sources currently indexed:

  my @sources = $kasago->sources;

tokens

Given a source and a file, returns a list of the tokens indexed:

  my @tokens  = $kasago->tokens($source, $file);

AUTHOR

Top

Leon Brocard <acme@astray.com>.

COPYRIGHT

Top


Kasago documentation Contained in the Kasago distribution.

package Kasago;
use strict;
use Carp qw(croak);
use DBI;
use File::Find::Rule;
use File::stat;
use File::Slurp;
use Kasago::Hit;
use Kasago::Token;
use Path::Class;
use PPI;
use Search::QueryParser;
use base qw( Class::Accessor::Chained::Fast );
__PACKAGE__->mk_accessors(qw( dbh ));
our $VERSION = '0.29';

sub new {
  my $class = shift;
  my $self  = $class->SUPER::new(@_);

  croak "No dbh passed to Kasago" unless $self->dbh;
  $self->dbh->{RaiseError} = 1;
  $self->dbh->{AutoCommit} = 0;

  return $self;
}

sub DESTROY {
  my $self = shift;
  $self->dbh->disconnect;
}

sub init {
  my $self = shift;
  my $dbh  = $self->dbh;
  eval {

    eval { $dbh->do("select 1 from tokens"); };
    if ($dbh->errstr) {
      $dbh->rollback;
    } else {
      $dbh->do("
DROP TABLE tokens;
DROP TABLE lines;
DROP TABLE words;
DROP TABLE files;
DROP TABLE sources;
");
    }

    $dbh->do("
CREATE TABLE sources (
    source_id SERIAL PRIMARY KEY,
    source TEXT UNIQUE
) WITHOUT OIDS;
");

    $dbh->do("
CREATE TABLE files (
    file_id SERIAL PRIMARY KEY,
    source_id INTEGER REFERENCES sources ON DELETE CASCADE,
    file TEXT,
    UNIQUE (source_id, file)
) WITHOUT OIDS;
CREATE INDEX source_id_index ON files(source_id);
");

    $dbh->do("
CREATE TABLE words (
    word_id SERIAL PRIMARY KEY,
    word TEXT UNIQUE
) WITHOUT OIDS;
");

    $dbh->do("
CREATE TABLE lines (
    line_id SERIAL PRIMARY KEY,
    file_id INTEGER REFERENCES files ON DELETE CASCADE,
    row INTEGER,
    line TEXT,
    UNIQUE (file_id, row)
) WITHOUT OIDS;
CREATE INDEX file_id_index ON lines(file_id);
CREATE INDEX row_index ON lines(row);
");

    $dbh->do("
CREATE TABLE tokens (
    token_id SERIAL PRIMARY KEY,
    line_id INTEGER REFERENCES lines ON DELETE CASCADE,
    word_id INTEGER REFERENCES words ON DELETE CASCADE,
    col INTEGER
) WITHOUT OIDS;
CREATE INDEX line_id_index ON tokens(line_id);
CREATE INDEX word_id_index ON tokens(word_id);
");

    $dbh->commit;
  };
  die $@ if $@ && $@ !~ /already exists/;
}

my %word_cache;

sub import {
  my ($self, $source, $dir) = @_;
  return unless ref $self;    # This isn't Exporter, you know
  my $dbh = $self->dbh;

  $self->_delete($source);

  my $source_id =
    $dbh->selectcol_arrayref("SELECT source_id FROM sources WHERE source = ?",
    {}, $source)->[0];
  unless ($source_id) {
    $dbh->do("INSERT INTO sources (source) VALUES (?)", {}, $source);
    $source_id = $dbh->last_insert_id(undef, undef, "sources", undef);
  }

  foreach my $file (File::Find::Rule->new->file->in($dir)) {
    my $rel     = file($file)->relative($dir);
    my $file_id =
      $dbh->selectcol_arrayref(
      "SELECT file_id FROM files WHERE source_id = ? AND file = ?",
      {}, $source_id, $rel)->[0];
    unless ($file_id) {
      $dbh->do("INSERT INTO files (source_id, file) VALUES (?, ?)",
        {}, $source_id, $rel);
      $file_id = $dbh->last_insert_id(undef, undef, "files", undef);
    }

    my @lines = read_file($file);
    my $row   = 1;
    foreach my $line (@lines) {
      chomp $line;
      $dbh->do("INSERT INTO lines (file_id, row, line) VALUES (?, ?, ?)",
        {}, $file_id, $row++, $line);
    }

    my @line_ids = @{
      $dbh->selectcol_arrayref(
        "SELECT line_id FROM lines WHERE file_id = ? ORDER by row",
        {}, $file_id)
      };

    my @tokens = $self->_tokenise_perl($file);
    foreach my $token (@tokens) {
      my $word_id = $word_cache{ $token->value };
      unless ($word_id) {
        $word_id =
          $dbh->selectcol_arrayref("SELECT word_id FROM words WHERE word = ?",
          {}, $token->value)->[0];
        unless ($word_id) {
          $dbh->do("INSERT INTO words (word) VALUES (?)", {}, $token->value);
          $word_id = $dbh->last_insert_id(undef, undef, "words", undef);
        }
        $word_cache{ $token->value } = $word_id;
      }
      my $line_id = $line_ids[ $token->row - 1 ];
      $dbh->do("INSERT INTO tokens (line_id, word_id, col) VALUES (?, ?, ?)",
        {}, $line_id, $word_id, $token->col);
    }
  }

  $dbh->commit;
  $dbh->do("
    ANALYZE tokens;
    ANALYZE lines;
    ANALYZE words;
    ANALYZE files;
    ANALYZE sources;
    ");
}

sub _tokenise_perl {
  my ($self, $file) = @_;
  my @tokens;
  my $document = PPI::Document->new($file);
  return unless $document;
  $document->index_locations;
  foreach my $node (@{ $document->find('PPI::Statement::Package') || [] }) {
    push @tokens, Kasago::Token->_new_from_node($node, $node->namespace);
  }
  foreach my $node (@{ $document->find('PPI::Token::Symbol') || [] }) {
    push @tokens, Kasago::Token->_new_from_node($node, $node->canonical);
  }
  foreach my $node (@{ $document->find('PPI::Token::Number') || [] }) {
    push @tokens, Kasago::Token->_new_from_node($node, $node->content);
  }
  foreach my $node (@{ $document->find('PPI::Token::Word') || [] }) {
    push @tokens, Kasago::Token->_new_from_node($node, $node->content);
  }
  foreach my $node (@{ $document->find('PPI::Token::Quote') || [] }) {
    my ($line, $col) = @{ $node->location };
    my $left    = "";
    my $content = $node->content;
    my $split   = qr/(\s+|\.|'|")/;
    foreach my $word (split /$split/, $content) {
      if ($word !~ /^$split$/) {
        push @tokens,
          Kasago::Token->_new_from_node($node, $word,
          [ $line, $col + length($left) ]);
      }
      $left .= $word;
    }
  }
  foreach my $node (@{ $document->find('PPI::Token::Comment') || [] }) {
    my ($line, $col) = @{ $node->location };
    my $left  = "";
    my $split = qr/(\s+|\.)/;
    foreach my $word (split /$split/, $node->content) {
      if ($word !~ /^$split$/) {
        push @tokens,
          Kasago::Token->_new_from_node($node, $word,
          [ $line, $col + length($left) ]);
      }
      $left .= $word;
    }
  }
  foreach my $node (@{ $document->find('PPI::Token::Pod') || [] }) {
    my ($line, $col) = @{ $node->location };
    foreach my $content (split "\n", $node->content) {
      my $left  = "";
      my $split = qr/(\s+|\.)/;
      foreach my $word (split /$split/, $content) {
        if ($word !~ /^$split$/) {
          push @tokens,
            Kasago::Token->_new_from_node($node, $word,
            [ $line, $col + length($left) ]);
        }
        $left .= $word;
      }
      $line++;
    }
  }
  return @tokens;
}

sub delete {
  my ($self, $source) = @_;
  $self->_delete($source);
  $self->dbh->commit;
}

sub _delete {
  my ($self, $source) = @_;
  $self->dbh->do("DELETE FROM sources WHERE source = ?", undef, $source);
}

sub sources {
  my $self = shift;
  return @{ $self->dbh->selectcol_arrayref("SELECT source FROM sources") };
}

sub files {
  my ($self, $source) = @_;
  return @{
    $self->dbh->selectcol_arrayref("
SELECT file FROM sources
NATURAL INNER JOIN files
WHERE source=?
ORDER BY file;
",
      {},
      $source)
    };
}

sub tokens {
  my ($self, $source, $file) = @_;
  return @{
    $self->dbh->selectcol_arrayref("
SELECT word FROM files 
NATURAL INNER JOIN words
NATURAL INNER JOIN tokens
NATURAL INNER JOIN lines
WHERE source_id=(SELECT source_id from sources WHERE source=?) 
AND file=? ORDER BY word;
",
      {}, $source, $file)
    };
}

sub search {
  my ($self, $word) = @_;
  my $sth = $self->dbh->prepare("
SELECT source, file, row, col, line FROM words
NATURAL INNER JOIN files
NATURAL INNER JOIN tokens
NATURAL INNER JOIN lines
NATURAL INNER JOIN sources
WHERE word = ?
ORDER by source, file, row, col;
");
  $sth->execute($word);
  my @tokens;
  while (my ($source, $file, $row, $col, $line) = $sth->fetchrow_array) {
    push @tokens,
      Kasago::Token->new(
      {
        source => $source,
        file   => $file,
        row    => $row,
        col    => $col,
        value  => $word,
        line   => $line,
      }
      );
  }
  return @tokens;
}

sub search_merged {
  my ($self, $word) = @_;
  return $self->_merge($self->search($word));
}

sub _merge {
  my ($self, @all_tokens) = @_;
  my @hits;
  my $prev;
  my @tokens;

  foreach my $token (@all_tokens) {
    my $now = $token->source . ':' . $token->file . ':' . $token->row;
    if (defined $prev && $prev ne $now) {
      push @hits,
        Kasago::Hit->new(
        {
          source => $tokens[0]->source,
          file   => $tokens[0]->file,
          row    => $tokens[0]->row,
          line   => $tokens[0]->line,
          tokens => [@tokens],
        }
        );
      @tokens = ();
    }
    push @tokens, $token;
    $prev = $now;
  }
  push @hits,
    Kasago::Hit->new(
    {
      source => $tokens[0]->source,
      file   => $tokens[0]->file,
      row    => $tokens[0]->row,
      line   => $tokens[0]->line,
      tokens => [@tokens],
    }
    )
    if @tokens;
  return @hits;
}

sub _search_more_file {
  my ($self, $term) = @_;
  my $word = $term->{value};
  $word = $self->dbh->quote($word);
  return qq{
SELECT DISTINCT(file_id) FROM words
NATURAL INNER JOIN tokens
NATURAL INNER JOIN lines
WHERE word = $word};
}

sub search_more {
  my ($self, $words) = @_;
  my $dbh = $self->dbh;

  my $qp    = Search::QueryParser->new;
  my $query = $qp->parse($words);
  return unless $query;

  #use YAML; warn Dump $query;

  my (@union, @plus, @minus, @words);
  foreach my $term (@{ $query->{""} }) {
    push @union, $self->_search_more_file($term);
    push @words, $term->{value};
  }

  foreach my $term (@{ $query->{"+"} }) {
    push @plus,  $self->_search_more_file($term);
    push @words, $term->{value};
  }

  foreach my $term (@{ $query->{"-"} }) {
    push @minus, $self->_search_more_file($term);
  }

  my $subsql = "SELECT DISTINCT(file_id) FROM files WHERE file_id IN (";
  if (@union) {
    $subsql .= '(' . join(' UNION ', map { $_ = "($_)" } @union) . ')';
  }
  if (@plus) {
    $subsql .=
      ' INTERSECT (' . join(' INTERSECT ', map { $_ = "($_)" } @plus) . ')';
  }
  if (@minus) {
    $subsql .= ' EXCEPT (' . join(' UNION ', map { $_ = "($_)" } @minus) . ')';
  }
  $subsql .= ')';
  $subsql =~ s/WHERE  AND/WHERE /;
  $subsql =~ s/IN \( INTERSECT/IN ( /;

  #  die "$subsql;\n";
  #  warn "$subsql;\n";

  #  my @file_ids = @{$self->dbh->selectcol_arrayref($sql)};
  #  warn "@file_ids";

  #  my $file_ids = join(',', @file_ids);
  $words = join(',', map { $_ = $dbh->quote($_) } @words);

  my $sql = qq{
SELECT source, file, row, col, word, line FROM tokens
NATURAL INNER JOIN files
NATURAL INNER JOIN words
NATURAL INNER JOIN lines
NATURAL INNER JOIN sources
WHERE
file_id IN ($subsql) AND
word_id IN (SELECT word_id FROM words WHERE word IN ($words))
ORDER by source, file, row, col;
};

  #  warn $sql;
  my $sth = $dbh->prepare($sql);
  $sth->execute();
  my @tokens;
  while (my ($source, $file, $row, $col, $word, $line) = $sth->fetchrow_array) {
    push @tokens,
      Kasago::Token->new(
      {
        source => $source,
        file   => $file,
        row    => $row,
        col    => $col,
        value  => $word,
        line   => $line,
      }
      );
  }
  return @tokens;
}

sub search_more_merged {
  my ($self, $search) = @_;
  return $self->_merge($self->search_more($search));
}

1;

__END__