Daizu::Revision - functions for loading revisions from Subversion


Daizu documentation Contained in the Daizu distribution.

Index


Code Index:

NAME

Top

Daizu::Revision - functions for loading revisions from Subversion

DESCRIPTION

Top

These functions are used to load metadata about revisions (and the file path changes made in them) from the Subversion into the PostgreSQL database.

FUNCTIONS

Top

The following functions are available for export from this module. None of them are exported by default.

load_revision($cms, $desired_revnum)

Load information about new revisions, up to $desired_revnum. It starts from the revision after the last one which was loaded, and is idempotent (so if you try to load the same revision twice there will be no change).

If $desired_revnum is not defined, loads up to the most recent revision in the repository.

This can also be called as a method on a Daizu object.

known_branches($db)

Return a reference to a hash of known branches. The keys are the paths, and the values are the ID numbers found in the branch table.

Dies if it can't find a branch with the path trunk, because that indicates a broken database.

branch_and_path($branches, $path)

Return a list of two values, the ID number and path of the branch which a file at $path would be in, whether or not it actually exists. The path should be relative to the root of the repository, for example trunk/foo.html. It doesn't mater whether $path starts with a forward slash.

Returns nothing if the path is not in any branch, in which case Daizu CMS will simply ignore it.

file_guid($db, $branch_id, $path, $revnum)

Returns a reference to a hash of information about the GUID for the file in branch $branch_id at $path in revision $revnum, or undef if there is/was no such file.

The hash will contain the following keys:

id

GUID ID.

is_dir

True iff the associated file is a directory.

uri

The GUID URI (usually starting with tag:). This will be the custom GUID URI if overridden by a daizu:guid property.

custom_uri

True if a daizu:guid property has overridden the automatically generated GUID URI.

first_revnum

The number of the first revision in which $path was used for this file in this branch.

last_revnum

The number of the last revision in which $path was used for this file in this branch, or undef if it is still being used in the most recently loaded revision.

COPYRIGHT

Top


Daizu documentation Contained in the Daizu distribution.
package Daizu::Revision;
use warnings;
use strict;

use base 'Exporter';
our @EXPORT_OK = qw(
    load_revision
    known_branches branch_and_path file_guid
);

use SVN::Core;
use SVN::Ra;
use SVN::Repos;
use SVN::Delta;
use Carp qw( croak );
use Carp::Assert qw( assert DEBUG );
use Daizu::Util qw(
    like_escape
    validate_uri db_datetime
    db_row_exists db_select db_insert db_update transactionally
    mint_guid get_subversion_properties
);

sub load_revision
{
    my ($cms, $desired_rev) = @_;
    croak "bad revision number r$desired_rev"
        if defined $desired_rev && $desired_rev < 1;

    return transactionally($cms->{db}, \&_load_revision_txn,
                           $cms, $desired_rev);
}

sub _load_revision_txn
{
    my ($cms, $desired_rev) = @_;
    my $db = $cms->{db};
    my $ra = $cms->{ra};

    my $latest_rev = $ra->get_latest_revnum;
    $desired_rev = $latest_rev
        unless defined $desired_rev;
    croak "can't load up to r$desired_rev, latest revision is r$latest_rev"
        if $desired_rev > $latest_rev;

    my $last_known_rev = db_select($db, revision => {}, 'max(revnum)');
    $last_known_rev ||= 0;
    assert($last_known_rev <= $latest_rev) if DEBUG;

    # Return quickly if there's nothing to do.
    return $last_known_rev
        if $last_known_rev >= $desired_rev;

    my $branches = known_branches($db);
    my $trunk_id = $branches->{trunk};

    for my $revnum (($last_known_rev + 1) .. $desired_rev) {
        my @modified;
        my (@added, @copied, @deleted);
        my $date;

        # Gather information about the changes in this revision.
        # The database is only updated later, after the callback is finished,
        # because we might need to get other information from $ra, and that's
        # not allowed while get_log() is running.
        $ra->get_log('/', $revnum, $revnum, 0, 1, 1, sub {
            my ($paths, undef, undef, $rev_date) = @_;
            $date = $rev_date;

            while (my ($full_path, $changes) = each %$paths) {
                my $action = $changes->action;

                # Modified files don't affect identity, we just need to record
                # their paths so that we can check for property changes that
                # might affect the GUID URI.  Only changes on trunk affect it.
                if ($action eq 'M') {
                    next unless $full_path =~ s!^/trunk/!! && $full_path ne '';
                    push @modified, $full_path;
                    next;
                }

                my ($branch_id, $path) = branch_and_path($branches, $full_path);

                # Ignore files which live outside branches, except when we
                # see a new branch being created, in which case make a note
                # of where it is in the 'branch' table.  We recognize a new
                # branch as a copy from an existing branch to a new location
                # outside any existing branches.
                if (!defined $branch_id) {
                    my $from = $changes->copyfrom_path;
                    next unless defined $from;
                    my $to = $full_path;
                    $_ =~ s!^/!! for $from, $to;
                    next unless exists $branches->{$from};
                    $path = '';
                    $branch_id = db_insert($db, 'branch', path => $to);
                    $branches->{$to} = $branch_id;
                }

                if ($action ne 'D') {       # add or replace
                    my $from_full_path = $changes->copyfrom_path;
                    if (defined $from_full_path) {
                        croak "Error in revision $revnum: file $full_path copied from root directory"
                            if $from_full_path eq '/';
                        my ($from_branch_id, $from_path) =
                            branch_and_path($branches, $from_full_path);
                        croak "Error in revision $revnum: file $full_path copied from unknown branch"
                            unless defined $from_branch_id;

                        push @copied, {
                            full_path => $full_path,
                            path => $path,
                            branch_id => $branch_id,
                            from_full_path => $from_full_path,
                            from_path => $from_path,
                            from_branch_id => $from_branch_id,
                            from_rev => $changes->copyfrom_rev,
                        };
                    }
                    else {
                        push @added, [ $branch_id, $path, $full_path ];
                    }
                }

                if ($action ne 'A') {       # delete or replace
                    push @deleted, [ $branch_id, $path ];
                }
            }
        });

        _add_revision($db, $revnum, $date);
        _revision_guid_modifications($ra, $db, $trunk_id, $revnum, \@modified)
            if @modified;
        _revision_guid_path_changes($cms, $ra, $db, $trunk_id, $revnum,
                                    \@added, \@copied, \@deleted)
            if @added || @copied || @deleted;
    }

    return $desired_rev;
}

sub known_branches
{
    my ($db) = @_;

    my $sth = $db->prepare('select id, path from branch');
    $sth->execute;

    my %branch;
    while (my ($id, $path) = $sth->fetchrow_array) {
        $branch{$path} = $id;
    }

    croak "there is no branch called 'trunk' in the database"
        unless exists $branch{trunk};

    return \%branch;
}

sub branch_and_path
{
    my ($branches, $path) = @_;
    $path =~ s/^\///;
    return if $path eq '/';     # Don't care about root directory

    # Figure out which branch this path is on.  Do this by checking
    # ever longer prefixes of the path, since that will allow us to
    # find 'trunk' very quickly.
    my @path = split '/', $path;
    my $branch_path = '';
    my $branch_id;
    while (@path) {
        $branch_path .= '/' unless $branch_path eq '';
        $branch_path .= shift @path;
        next unless exists $branches->{$branch_path};
        $branch_id = $branches->{$branch_path};
        last;
    }

    # Ignore changes to files which aren't in a branch we know about.
    return unless defined $branch_id;

    # The file/directory path relative to the branch directory.
    # Empty string for the top level directory.
    $path = $path eq $branch_path ? ''
                                  : substr $path, length($branch_path) + 1;

    return ($branch_id, $path);
}

sub file_guid
{
    my ($db, $branch_id, $path, $revnum) = @_;

    return $db->selectrow_hashref(q{
                select g.id, g.is_dir, g.uri, g.old_uri, g.custom_uri,
                              p.first_revnum, p.last_revnum
                from file_guid g
                inner join file_path p on g.id = p.guid_id
                where p.branch_id = ?
                    and p.path = ?
                    and p.first_revnum <= ?
                    and (p.last_revnum is null or p.last_revnum >= ?)
        }, undef, $branch_id, $path, $revnum, $revnum);
}

sub _add_revision
{
    my ($db, $revnum, $date) = @_;
    assert(defined $revnum) if DEBUG;

    $date = db_datetime($date);
    croak "revision r$revnum has no datetime stamp, or it is invalid"
        unless defined $date;

    db_insert($db, 'revision',
        revnum => $revnum,
        committed_at => $date,
    );
}

sub _adjust_custom_uri
{
    my ($ra, $db, $path, $revnum, $guid) = @_;
    assert($path ne '') if DEBUG;

    my $full_path = "trunk/$path";
    my $props = get_subversion_properties($ra, $full_path, $revnum);
    return unless defined $props;   # not present in trunk

    if (exists $props->{'daizu:guid'}) {
        my $new_uri = validate_uri($props->{'daizu:guid'});
        croak "error in revision $revnum: invalid URI in 'daizu:guid' property on '$full_path'"
            unless defined $new_uri;
        $new_uri = $new_uri->canonical;

        if ($guid->{custom_uri}) {
            if ($guid->{uri} ne $new_uri) {
                # There was a custom URI already, but it has been changed.
                db_update($db, file_guid => $guid->{id}, uri => $new_uri);
            }
        }
        else {
            # The guid property has been added, so switch from the standard
            # guid to the custom one.
            $db->do(q{
                                update file_guid
                                set custom_uri = true,
                                        old_uri = uri,
                                        uri = ?
                                where id = ?
                        }, undef, $new_uri, $guid->{id});
        }
    }
    elsif ($guid->{custom_uri}) {
        # The guid property has been removed, so switch back to the
        # original standard GUID.
        $db->do(q{
                        update file_guid
                        set uri = old_uri,
                                old_uri = null,
                                custom_uri = false
                        where id = ?
                }, undef, $guid->{id});
    }
}

sub _revision_guid_modifications
{
    my ($ra, $db, $trunk_id, $revnum, $modified) = @_;

    for my $path (@$modified) {
        assert($path ne '') if DEBUG;
        my $guid = file_guid($db, $trunk_id, $path, $revnum);
        croak "modified file 'trunk/$path' has no GUID in revision $revnum"
            unless defined $guid;
        db_update($db, file_guid => $guid->{id},
                  last_changed_revnum => $revnum);
        _adjust_custom_uri($ra, $db, $path, $revnum, $guid);
    }
}

sub _revision_guid_path_changes
{
    my ($cms, $ra, $db, $trunk_id, $revnum, $added, $copied, $deleted) = @_;

    # Record last revnum of deleted paths.
    for my $del (@$deleted) {
        my ($branch_id, $path) = @$del;

        if ($path eq '') {
            # If the top-level directory is deleted, that means delete
            # everything on the branch.
            db_update($db, file_path => { branch_id => $branch_id,
                                          last_revnum => undef },
                last_revnum => $revnum - 1,
            );
        }
        else {
            db_update($db, file_path => { branch_id => $branch_id,
                                          path => $path,
                                          last_revnum => undef },
                last_revnum => $revnum - 1,
            );

            # If it's a directory, mark the demise of all its children.
            $db->do(q{
                                update file_path
                                set last_revnum = ?
                                where branch_id = ?
                                    and path like ?
                                    and last_revnum is null
                        }, undef, $revnum - 1, $branch_id, like_escape($path) . '/%');
        }
    }

    # Process copies sorted in reverse order, so that subdirectories are
    # done before their parents.  That way, when I process all the paths
    # within a copied directory I can skip any which have already been
    # processed separately, because for example the target subdirectory was
    # copied from somewhere else.
    my %source_guid;
    my %added_path;
    for (sort { $b->{path} cmp $a->{path} } @$copied) {
        # If it's not the top-level directory, process the copy.
        my $is_dir = 1;
        if ($_->{from_path} ne '') {
            my $guid = file_guid($db, $_->{from_branch_id}, $_->{from_path},
                                 $_->{from_rev});
            croak "Error in revision $revnum: file $_->{full_path} copied from source with no GUID ($_->{from_full_path} r$_->{from_rev})"
                unless defined $guid;

            push @{$source_guid{$_->{branch_id}}{$guid->{id}}}, $_;
            undef $added_path{$_->{path}};
            $is_dir = $guid->{is_dir};
        }

        # If the path being copied is a directory, then also copy all of its
        # children from the same source.
        if ($is_dir) {
            my $branch_path =
                db_select($db, branch => $_->{branch_id}, 'path');
            my $from_branch_path =
                db_select($db, branch => $_->{from_branch_id}, 'path');

            my $sth = $db->prepare(q{
                                select path, guid_id
                                from file_path
                                where branch_id = ?
                                    and path like ?
                                    and first_revnum <= ?
                                    and (last_revnum is null or last_revnum >= ?)
                        });
            $sth->execute($_->{from_branch_id},
                          ($_->{from_path} eq ''
                              ? '%'
                              : like_escape($_->{from_path}) . '/%'),
                          $_->{from_rev}, $_->{from_rev});

            my $prefix_len = length $_->{from_path};
            ++$prefix_len if $prefix_len;   # separating /

            while (my ($from_path, $guid_id) = $sth->fetchrow_array) {
                my $from_full_path = "$from_branch_path/$from_path";
                my $child_path = substr $from_path, $prefix_len;
                my $path = $_->{path} eq '' ? $child_path
                                            : "$_->{path}/$child_path";
                next if exists $added_path{$path};
                my $full_path = "$branch_path/$path";
                push @{$source_guid{$_->{branch_id}}{$guid_id}}, {
                    full_path => $full_path,
                    path => $path,
                    branch_id => $_->{branch_id},
                    from_full_path => $from_full_path,
                    from_path => $from_path,
                    from_branch_id => $_->{from_branch_id},
                    from_rev => $_->{from_rev},
                };
                undef $added_path{$path};
            }
        }
    }

    while (my ($branch_id, $guids) = each %source_guid) {
        while (my ($guid_id, $copies) = each %$guids) {
            my @copies = sort { $a->{full_path} cmp $b->{full_path} } @$copies;

            my $guid_already_present = db_row_exists($db, file_path =>
                guid_id => $guid_id,
                branch_id => $branch_id,
                last_revnum => undef,
            );

            # If there isn't already a live path in the target branch for this
            # GUID then one of the copies with it gets to keep it.
            if (!$guid_already_present) {
                my $keep = shift @copies;
                db_insert($db, 'file_path',
                    guid_id => $guid_id,
                    path => $keep->{path},
                    branch_id => $keep->{branch_id},
                    first_revnum => $revnum,
                );

                if ($keep->{branch_id} == $trunk_id) {
                    my $guid = file_guid($db, $trunk_id, $keep->{path},
                                         $revnum);
                    _adjust_custom_uri($ra, $db, $keep->{path}, $revnum, $guid);
                    db_update($db, file_guid => $guid->{id},
                              last_changed_revnum => $revnum);
                }
            }

            # Copies which can't keep their GUID, because it's already live
            # in the target branch, get treated just like adds without history.
            for (@copies) {
                push @$added, [ $_->{branch_id}, $_->{path}, $_->{full_path} ];
            }
        }
    }

    for my $add (@$added) {
        my ($branch_id, $path, $full_path) = @$add;
        next if $path eq '';

        # First mint a new GUID for it.
        my $is_dir = $ra->check_path($full_path, $revnum) == $SVN::Node::dir;
        my ($guid_id, $guid_uri) = mint_guid($cms, $is_dir, $path, $revnum);

        _adjust_custom_uri($ra, $db, $path, $revnum, {
            uri => $guid_uri,
            id => $guid_id,
            is_dir => $is_dir,
        });

        db_insert($db, file_path =>
            guid_id => $guid_id,
            path => $path,
            branch_id => $branch_id,
            first_revnum => $revnum,
        );
    }
}

1;
# vi:ts=4 sw=4 expandtab