OOPS::GC - Garbage Collector for OOPS


OOPS documentation Contained in the OOPS distribution.

Index


Code Index:

NAME

Top

 OOPS::GC - Garbage Collector for OOPS

SYNOPSIS

Top

 use OOPS::GC;

 gc(%args);

DESCRIPTION

Top

OOPS::GC provides a garbage collector for your persistent data. You only need this if you've got a persistent memory leak in your program. The way to leak memory is to make a circular reference between objects and then delete all reference to the objects with the circular reference.

It's easy to do accidently.

If you find your database is growing more than you think it should, run OOPS::CG::gc() on it.

For the database that support concurrent access (ie: not SQLite), the garbage collector runs in the background and does not lock up the database. The garbage collector may need to temporarily store additional information in the database so don't run it when your disk is already full!

INVOCATION

Top

The arguments you pass to gc() are exactly the same as the arguments you pass to OOPS::dbiconnect().

LICENSE

Top

Same as for OOPS.


OOPS documentation Contained in the OOPS distribution.

package OOPS::GC;

use OOPS;
require OOPS::Setup;
use OOPS::TxHash;
require Exporter;
use strict;
use warnings;
use Carp qw(confess);

our @ISA = qw(Exporter);
our @EXPORT = qw(gc);

# scale adjusts the limits
our $min_scale_factor	= 0.5;
our $max_scale_factor	= 20;
our $scale_up		= 1.2;
our $scale_down		= 0.8;

# limits
our $too_many_todo	= 5000;
our $work_length	= 200;
our $clear_batchsize	= 50;
our $virtual_hash_slice	= 20;
our $maximum_spill_size	= 500;

# debugging
our $debug		= 0;
our $debug_scale  	= 0;
our $debug_cleaned	= 0;

# counters
our $overflow_count	= 0;
our $readsaved_count	= 0;
our $error;

#
# Mark objects at the last minute, just before processing
# them?  Advantage: simpler, less chance of collision with
# other users of the database.  Disadvantage: more database
# query results because objects on the todo list will be
# returned.
#
our $mark_last_minute = 1;    

sub gc
{
	my (%args) = @_;

	$error = undef;
	my $dbms;
	my $prefix;
	my $hasbig = 1;
	transaction(sub {
		my $dbh;
		($dbh, $dbms, $prefix) = OOPS->dbiconnect(%args, readonly => 1);
		$dbh->disconnect();
	});
	require "OOPS/$dbms.pm";

	#
	# We'll need to transform the queries...
	#
	my $TPsub = sub {
		my ($query) = @_;
		$query =~ s/TP_/$prefix/g;
		return $query;
	};

	my $gcgen;
	transaction(sub {
		my ($dbh, $dbms, $prefix, $dbo) = OOPS->dbiconnect(%args);
		transaction_lock($dbo);
		undef $dbo;
		($gcgen) = $dbh->selectrow_array(&$TPsub(<<END));
			SELECT pval
			FROM TP_attribute
			WHERE id = 2 AND pkey = 'GC GENERATION'
END
		unless ($gcgen) {
			print "### ERROR: ".$dbh->errstr()."\n";
			confess $dbh->errstr() 
		}
		my $old = $gcgen++;
		print "# GC: New generation number: $gcgen\n";
		$dbh->do(&$TPsub(<<END), undef, $gcgen, $old) or confess $dbh->errstr;
			UPDATE	TP_attribute
			SET	pval = ?
			WHERE	id = 2 AND pkey = 'GC GENERATION' AND pval = ?
END
		$dbh->do(&$TPsub(<<END), undef, $gcgen, $OOPS::last_reserved_oid) or confess $dbh->errstr;
			UPDATE	TP_object
			SET	gcgeneration = ?
			WHERE	id <= ?
END
		$dbh->do(&$TPsub(<<END)) or confess $dbh->errstr;
			DELETE	FROM TP_attribute
			WHERE	id = $OOPS::gc_overflow_id
END
		$dbh->commit;
	});

	exit if $debug == 99;

	my $foo;
	my %todo_real = (
		1	=> \$foo,		# virtual hash
		2	=> undef,		# hash
		3	=> undef,		# hash
	);

	#
	# Stating at the root nodes, follow all references changing the 
	# GC generation as we go.
	#

	my $bailout = 0;
	my $scale_factor = 1;
	my $last_overflow_rows = 0;

	while (%todo_real && ! $bailout) {
		printf "# GC: Top of loop, todo = %d, scale = %.1f\n", scalar(keys %todo_real), $scale_factor if $debug || $debug_scale;
		my $accomplished = 0;
		my $overflow_rows = 0;
		my $restarts = 0;
		transaction(sub {
			my $th = tie my %todo, 'OOPS::TxHash', \%todo_real or confess;

			my ($dbh, $dbms, $prefix, $dbo) = OOPS->dbiconnect(%args);
			transaction_lock($dbo) if ++$restarts > 3;

			my ($curgcgen) = $dbh->selectrow_array(&$TPsub(<<END));
				SELECT pval
				FROM TP_attribute
				WHERE id = 2 AND pkey = 'GC GENERATION'
END
			if ($curgcgen != $gcgen) {
				$bailout = "Another GC is running, only one allowed at a time";
				print "# GC: curgen($curgcgen) != gcgen($gcgen)\n" if $debug;
				%todo_real = ();
				return;
			}

			my $overflow_rowsQ = $dbo->adhoc_query(<<END);
				SELECT	count(*)
				FROM	TP_attribute
				WHERE	id = $OOPS::gc_overflow_id
END
			$overflow_rowsQ->execute() || confess;
			($overflow_rows) = $overflow_rowsQ->fetchrow_array();
			$overflow_rowsQ->finish;

			my $work = 0;
			my $objects_done = 0;
			my $nlimit = int($virtual_hash_slice * $scale_factor);

			printf "# GC: Inner loop, work = %d, todo = %d\n", $work, scalar(%todo) if $debug;
			while ($work < $work_length * $scale_factor && %todo && %todo <= $too_many_todo * $scale_factor) {
				my ($id, $key);
				1 while not (($id, $key) = each(%todo)); # OOPS::TxHash better be right about SCALAR...

				printf "# GC: working on %d/%s\n", $id, defined($key) ? $key : "<undef>" if $debug;

				my $q;
				my $oldkey = 0;
				my $limit = defined($key) ? "LIMIT $nlimit" : "";
				if (defined $key && ! ref($key)) {
					print "# GC: continuing through a virtual hash\n" if $debug;
					$q = $dbo->adhoc_query(<<END, execute => [ $id, $key, $gcgen]) or confess $dbh->errstr;
						SELECT	o.id, o.otype, o.virtual, a.pkey
						FROM	TP_object AS o,
							TP_attribute AS a
						WHERE	a.id = ? 
						  AND	a.pkey > ?
						  AND	a.ptype = 'R'
						  AND	o.id = DBO:CAST:PGBYTEA2INT(a.pval)
						  AND	o.gcgeneration < ?
						ORDER	BY a.pkey
						LIMIT	$nlimit
END
					$oldkey = 1;
				} else {
					print "# GC: starting on a fresh object\n" if $debug;
					$objects_done++;
					if ($mark_last_minute) {
						print "# GC: marking $id as gen $gcgen\n" if $debug > 5;
						$q = $dbo->adhoc_query(<<END, execute => [$gcgen, $id, $gcgen]) or confess $dbh->errstr;
							UPDATE	TP_object 
							SET	gcgeneration = ?
							WHERE	id = ?
							  AND	gcgeneration < ?
END
						$q->finish;
					}
					$q = $dbo->adhoc_query(<<END, execute => [ $id, $gcgen ]) or confess $dbh->errstr;
						SELECT	o.id, o.otype, o.virtual, a.pkey
						FROM	TP_object AS o,
							TP_attribute AS a
						WHERE	a.id = ? 
						  AND	a.ptype = 'R'
						  AND	o.id = DBO:CAST:PGBYTEA2INT(a.pval)
						  AND	o.gcgeneration < ?
						ORDER	BY a.pkey
						$limit
END
				}


				my ($count, $newkey) = get_todo($q, \%todo);
				$work += $count || 1;

				print "# GC: found $count new things to worry about\n" if $debug;

				#
				# mark objects as done as they are added to the todo list
				#
				if ($mark_last_minute) {
					if ($limit and $count == $nlimit) {
						$todo{$id} = $newkey;
					} else {
						delete $todo{$id};
					}
				} elsif ($limit and $count == $nlimit) {
					if ($oldkey) {
						print "# GC: partial set from a virtual hash that was partway through '$key' to '$newkey'\n" if $debug;
						$q = $dbo->adhoc_query(<<END, execute => [$gcgen, $id, $key, $newkey, $gcgen]) or confess $dbh->errstr;
							UPDATE	TP_object AS o,
								TP_attribute AS a
							SET	o.gcgeneration = ?
							WHERE	a.id = ?
							  AND	a.pkey > ?
							  AND	a.pkey <= ?
							  AND	a.ptype = 'R'
							  AND	o.id = DBO:CAST:PGBYTEA2INT(a.pval)
							  AND	o.gcgeneration < ?
END
					} else {
						# partial set from a virtual hash that just started
						print "# GC: partial set from a new virtual hash ... upto '$newkey'\n" if $debug;
						$q = $dbo->adhoc_query(<<END, execute => [$gcgen, $id, $newkey, $gcgen]) or confess $dbh->errstr;
							UPDATE	TP_object AS o,
								TP_attribute AS a
							SET	o.gcgeneration = ?
							WHERE	a.id = ?
							  AND	a.pkey <= ?
							  AND	a.ptype = 'R'
							  AND	o.id = a.pval
							  AND	o.gcgeneration < ?
END
					}
					$todo{$id} = $newkey;
				} elsif ($limit && $oldkey) {
					print "# GC: finish a partially done virtual hash ...  from '$key'\n" if $debug;
					$q = $dbo->adhoc_query(<<END, execute => [$gcgen, $id, $key, $gcgen]) or confess $dbh->errstr;
						UPDATE	TP_object AS o,
							TP_attribute AS a
						SET	o.gcgeneration = ?
						WHERE	a.id = ?
						  AND	a.pkey > ?
						  AND	a.ptype = 'R'
						  AND	o.id = DBO:CAST:PGBYTEA2INT(a.pval)
						  AND	o.gcgeneration < ?
END
					delete $todo{$id};
				} else {
					print "# GC: finished an object\n" if $debug;
					$q = $dbo->adhoc_query(<<END, execute => [$gcgen, $id, $gcgen]) or confess $dbh->errstr;
						UPDATE	TP_object AS o,
							TP_attribute AS a
						SET	o.gcgeneration = ?
						WHERE	a.id = ?
						  AND	a.ptype = 'R'
						  AND	o.id = a.pval 
						  AND	o.gcgeneration < ?
END
					delete $todo{$id};
				}

			}

			#
			# At this point, we've either done enough work to finish the
			# transaction or we've run out of things to do.  First, if we've
			# run out, see if other processes have left us some new stuff
			# for our todo list.
			#

			while (! %todo) {
				print "# GC: TODO is empty, look for more\n" if $debug;
				my $q = $dbo->adhoc_query(<<END, execute => [$gcgen]) or confess $dbh->errstr;
					SELECT	o.id, o.otype, o.virtual, a.pkey
					FROM	TP_object AS o,
						TP_attribute AS a
					WHERE	a.id = $OOPS::gc_overflow_id
					  AND	o.id = DBO:CAST:PGBYTEA2INT(a.pkey)
					  AND	o.gcgeneration < ?
					ORDER BY a.pkey
					LIMIT 	$nlimit
END
				my ($count, $pkey) = get_todo($q, \%todo);
				$q->finish;
				my @args;
				if ($count == $nlimit) {
					$q = $dbo->adhoc_query(<<END) or confess $dbh->errstr;
						DELETE FROM TP_attribute
						WHERE	id = $OOPS::gc_overflow_id
						  AND	pkey <= ?
END
					@args = ($pkey);
				} else {
					$q = $dbo->adhoc_query(<<END) or confess $dbh->errstr;
						DELETE FROM TP_attribute
						WHERE	id = $OOPS::gc_overflow_id
END
					@args = ();
				}
				my $r = $q->execute(@args);
				confess unless $r;
				$objects_done += $r - $count;
				$q->finish;
				$readsaved_count += $count;
				print "# GC: TODO was empty, found $count more\n" if $debug;
				last unless $count;
			}

			#
			# Alternatively, if we're gotten here because we've done enough
			# let's check to make sure our todo list isn't getting too big.  If it
			# is, let's save some of it to the database.
			#

			if (%todo > $too_many_todo * $scale_factor) {
				print "# GC: TODO is overflowing, save some for later\n" if $debug;
				my $q1 = $dbo->adhoc_query(<<END) or confess $dbh->errstr;
					SELECT	COUNT(*)
					FROM	TP_attribute
					WHERE	id = $OOPS::gc_overflow_id
					  AND	pkey = ?
END
				my $q2 = $dbo->adhoc_query(<<END) or confess $dbh->errstr;
					INSERT INTO TP_attribute
					VALUES ($OOPS::gc_overflow_id, ?, '', '0')
END
				my $spilled = 0;
				for my $id (keys %todo) {
					next if defined $todo{$id};
					$q1->execute($id) or confess $dbh->errstr;
					my ($rc) = $q1->fetchrow_array();
					next if $rc > 0;
					$q2->execute($id) or confess $dbh->errstr;
					delete $todo{$id};
					$overflow_count++;
					last if %todo < $too_many_todo * $scale_factor / 2;
					last if ++$spilled > $maximum_spill_size * $scale_factor;
				}
				$q1->finish();
				$q2->finish();
			}
			#
			# Okay, time to record what we've done and go 'round again.
			#

			print "# GC commit\n" if $debug;
			$dbh->commit or confess $dbh->errstr;
			$th->commit;
			$accomplished = $objects_done;
		});
		printf "# GC: accomplished: %d change in overflow rows: %d old scale: %.2f restarts: %d\n", $accomplished, $overflow_rows - $last_overflow_rows, $scale_factor, $restarts if $debug_scale;
		if ($accomplished < $overflow_rows - $last_overflow_rows) {
			# we didn't get as much done as new work came in.
			if ($restarts < 5) {
				# try to do more work in each transaction
				$scale_factor *= $scale_up;
			} else {
				# maybe we're trying to do too much at once?
				$scale_factor *= $scale_down;
			}
		} else {
			# let's do less at once so we have a smaller impact
			# on other transactions
			$scale_factor *= ((1+$scale_down)/2);
		}
	}

	if ($bailout) {
		print STDERR "# GC: $bailout\n";
		$error = $bailout;
		return undef;
	}

	my $total = 0;

	#
	# Now, we want to run a scan across all objects but we don't
	# want to lock the whole database as we do it.  
	#

	print "# GC: Scanning for objects we didn't mark\n" if $debug || $debug_cleaned;

	for(;;) {
		my $idlast = -1;
		my $last;
		my $restarts = 0;
		transaction(sub {
			my ($dbh, $dbms, $prefix, $dbo) = OOPS->dbiconnect(%args);
			transaction_lock($dbo) if ++$restarts > 3;
			my $q = $dbh->prepare(&$TPsub(<<END)) or confess $dbh->errstr;
				SELECT	id
				FROM	TP_object
				WHERE	id > ?
				  AND	gcgeneration < ?
				LIMIT	$clear_batchsize
END
			$q->execute($idlast, $gcgen) or confess $dbh->errstr;
			my @idset;
			while (my ($id) = $q->fetchrow_array()) {
				push(@idset, $id);
			}
			$q->finish();
			return unless @idset;

			print "# GC: found @idset, will delete 'em\n" if $debug || $debug_cleaned;

			my $decrement = $dbo->adhoc_query(<<END) or confess $dbh->errstr;
				# DBO:name GC::decrement
				:sqlite2:
					UPDATE	TP_object 
					SET	refs = refs - 1
					WHERE	id IN (
						SELECT	pval
						FROM	TP_attribute
						WHERE	id = ?
						  AND	ptype = 'R'
						)
				:sqlite:
					UPDATE	TP_object 
					SET	refs = refs - 1
					WHERE	id IN (
						SELECT	pval
						FROM	TP_attribute
						WHERE	id = ?
						  AND	ptype = 'R'
						)
				:mysql:
					UPDATE	TP_object AS o, TP_attribute AS a
					SET	o.refs = o.refs - 1
					WHERE	a.id = ?
					  AND	a.ptype = 'R'
					  AND	o.id = a.pval
				:pg:
					UPDATE	TP_object 
					SET	refs = refs - 1
					FROM	TP_attribute
					WHERE	TP_attribute.id = ?
					  AND	TP_attribute.ptype = 'R'
					  AND	TP_object.id = DBO:CAST:PGBYTEA2INT(TP_attribute.pval)
END
			my $ro = $dbh->prepare(&$TPsub(<<END)) or confess $dbh->errstr;
				DELETE FROM TP_object
				WHERE id = ?
END
			my $ra = $dbh->prepare(&$TPsub(<<END)) or confess $dbh->errstr;
				DELETE FROM TP_attribute
				WHERE id = ?
END
			my $ref = $dbh->prepare(&$TPsub(<<END)) or confess $dbh->errstr;
				UPDATE	TP_attribute
				SET	pval = NULL, ptype = '0'
				WHERE	pval = ?
END
			for my $id (@idset) {
				$decrement->execute($id) or confess $dbh->errstr;
				$ro->execute($id) or confess $dbh->errstr;
				$ra->execute($id) or confess $dbh->errstr;
				$ref->execute($id) or confess $dbh->errstr;
			}

			if ($hasbig) {
				my $rb = $dbh->prepare(&$TPsub(<<END)) or confess $dbh->errstr;
					DELETE FROM TP_big
					WHERE id = ?
END
				for my $id (@idset) {
					$rb->execute($id) or confess $dbh->errstr;
				}
			}
			$dbh->commit or confess $dbh->errstr;
			$total += @idset;
			$last = $idset[$clear_batchsize-1];
		});
		last unless $last;
		$idlast = $last;
	};

	print "# GC $total objects removed\n" if $debug;
	return $total;
}

sub get_todo
{
	my ($q, $todoref) = @_;

	my ($oid, $otype, $ovirtual, $pkey);
	my $count = 0;
	while (($oid, $otype, $ovirtual, $pkey) = $q->fetchrow_array()) {
		print "# GC:	more todo $oid $otype $ovirtual $pkey\n" if $debug;
		$count++;
		next if exists $todoref->{$oid};
		if ($ovirtual) {
			$todoref->{$oid} = \'VIRTUAL';
		} else {
			$todoref->{$oid} = undef;
		}
	}
	$q->finish;
	return $count unless wantarray();
	return ($count, $pkey);
}

#
# Hopefully this will avoid persistent deadlock problems
# with mysql.
#
sub transaction_lock
{
	my ($dbo) = @_;
	$dbo->adhoc_query(<<END, execute => [1]);
		SELECT	counter
		FROM	TP_object
		WHERE	id = ?
END
}

1;

__END__

We're trying to do a recursive traversal of all data structures w/o locking much
of the database at a time.

We do this by keeping a todo list of things that we haven't yet processed.
Things on the todo list have have been marked but their children have not.

Since this is done live, what happens when we have 


	$root => {
		A	=> {
			foo	=> (bless { }, bar),
		},
		B	=> {
		},
	}

and then after we've processed B, but before we've gotten to A, foo moves from A to B?


To handle this, OOPS needs to be able to signal us whenever it adds a new reference
from an object that has been GC'ed to one that has not.



SELECT	o.id, o.otype, o.virtual, a.pkey
FROM	charm2object AS o,
charm2attribute AS a
WHERE	a.id = 113 
AND	a.ptype = 'R'
AND	o.id = a.pval 
AND	o.gcgeneration < 3
ORDER	BY a.pkey;