| Python-Bytecode documentation | Contained in the Python-Bytecode distribution. |
Python::Bytecode - Disassemble and investigate Python bytecode
use Python::Bytecode
my $bytecode = Python::Bytecode->new($bytecode);
my $bytecode = Python::Bytecode->new(FH);
for ($bytecode->disassemble) {
print $_->[0],"\n"; # Textual representation of disassembly
}
foreach my $constant (@{$bytecode->constants()}) {
if ($constant->can('disassemble')) {
print "code constant:\n";
for ($constant->disassemble) {
print $_->[0], "\n";
}
}
}
Python::Bytecode accepts a string or filehandle contain Python
bytecode and puts it into a format you can manipulate.
disassembleThis is the basic method for getting at the actual code. It returns an
array representing the individual operations in the bytecode stream.
Each element is a reference to a three-element array containing
a textual representation of the disassembly, the opcode number, (the
opname() function can be used to turn this into an op name) and
the argument to the op, if any.
constantsThis returns an array reflecting the constants table of the code object.
Some operations such as LOAD_CONST refer to constants by index in
this array.
labelsSimilar to constants, some operations branch to labels by index
in this table.
varnamesAgain, when variables are referred to by name, the names are stored as an index into this table.
filenameThe filename from which this compiled bytecode is derived.
There are other methods, but you can read the code to find them. It's not hard, and besides, it's probably easiest to work off the textual representation of the disassembly anyway.
The structure of the decoded bytecode file is reasonably simple.
The output of the new method is an object that represents the fully
parsed bytecode file. This object contains the information about the
bytecode file, as well as the top-level code object for the file.
Each python code object in the bytecode file has its own perl object that represents it. This object can be disassembled, has its own constants (which themselves may be code objects) and its own variables.
The module completely decodes the bytecode object when the bytecode
file is handed to the new method, but to get all the pieces of the
bytecode may require digging into the constants of each code object.
Simon Cozens, simon@cpan.org. Mutation for Python 2.3 by Dan
Sugalski dan@sidhe.org
This code is licensed under the same terms as Perl itself.
| Python-Bytecode documentation | Contained in the Python-Bytecode distribution. |
package Python::Bytecode; use 5.6.0; use strict; our $VERSION = "2.7"; use overload '""' => sub { my $obj = shift; "<Code object ".$obj->{name}.", file ".$obj->{filename}." line ".$obj->{lineno}." at ".sprintf('0x%x>',0+$obj); }, "0+" => sub { $_[0] }, fallback => 1; sub new { my ($class, $fh) = (@_); my $self = bless { }; if (ref $fh) { $self->{fh} = $fh; } else { $self->{stuff} = [ split //, $fh ] } my $magic = $self->r_long(); my $data = _get_data_by_magic($magic); $self->{version} = $Python::Bytecode::versions{$magic}; # What we use to read words from the source. May be r_short or r_long *Python::Bytecode::r_word = $Python::Bytecode::readword{$magic}; $self->r_long(); # Second magic number $self->{mainobj} = $self->r_object(); $self->_init($data); return $self; } sub _get_data_by_magic { require Python::Bytecode::v21; require Python::Bytecode::v22; require Python::Bytecode::v23; my $magic = shift; unless (exists $Python::Bytecode::data{$magic}) { require Carp; Carp::croak("Unrecognised magic number $magic; Only know Python versions " . join ", ", map { "$_ ($Python::Bytecode::versions{$_})" } keys %Python::Bytecode::versions ); } return $Python::Bytecode::data{$magic}; } sub r_byte { my $self = shift; if (exists $self->{stuff}) { ord shift @{$self->{stuff}};} else { ord getc $self->{fh} } } sub r_long { use integer; my $self = shift; my $x = $self->r_byte; $x |= $self->r_byte << 8; $x |= $self->r_byte << 16; $x |= $self->r_byte << 24; return $x; } sub r_short { my $self = shift; my $x = $self->r_byte; $x |= $self->r_byte << 8; $x |= -($x & 0x8000); return $x; } sub r_string { my $self = shift; my $length = $self->r_long; my $buf; if ( exists $self->{stuff}) { $buf = join "", splice ( @{$self->{stuff}},0,$length,() ); } else { read $self->{fh}, $buf, $length; } return $buf; } # This really ought to return a real unicode string, rather than a plain # binary string that we fib about sub r_unicode { my $self = shift; my $length = $self->r_long; my $buf; if ( exists $self->{stuff}) { $buf = join "", splice ( @{$self->{stuff}},0,$length,() ); } else { read $self->{fh}, $buf, $length; } return $buf; } sub r_float { my $self = shift; my $length = $self->r_byte; my $buf; if ( exists $self->{stuff}) { $buf = join "", splice ( @{$self->{stuff}},0,$length,() ); } else { read $self->{fh}, $buf, $length; } $buf += 0; return $buf; } sub r_complex { my $self = shift; my $length = $self->r_byte; my $real; if ( exists $self->{stuff}) { $real = join "", splice ( @{$self->{stuff}},0,$length,() ); } else { read $self->{fh}, $real, $length; } $real += 0; $length = $self->r_byte; my $imag; if ( exists $self->{stuff}) { $imag = join "", splice ( @{$self->{stuff}},0,$length,() ); } else { read $self->{fh}, $imag, $length; } $imag += 0; return bless([$real, $imag], "Python::Bytecode::Complex"); } sub r_object { my $self = shift; my $cooked = shift; my $type = chr $self->r_byte(); return $self->r_code() if $type eq "c"; if ($cooked) { return bless \($self->r_string()), "Python::Bytecode::String" if $type eq "s"; return bless \($self->r_long()), "Python::Bytecode::Long" if $type eq "i"; return bless \do{my $x=undef}, "Python::Bytecode::Undef" if $type eq "N"; } else { return $self->r_string if $type eq "s"; return $self->r_long() if $type eq "i"; return undef if $type eq "N"; # None indeed. } if ($type eq "(") { my @tuple = $self->r_tuple($cooked); return bless [@tuple], "Python::Bytecode::Tuple" unless wantarray; return @tuple; } if ($type eq 'u') { return bless\($self->r_unicode()), "Python::Bytecode::Unicode"; } if ($type eq 'l') { return bless \($self->r_extralong()), "Python::Bytecode::Extralong"; } if ($type eq 'f') { return bless \($self->r_float()), "Python::Bytecode::Float"; } if ($type eq 'x') { return $self->r_complex(); } die "Oops! I didn't implement ".ord($type) . " (". length($type) . " bytes)"; } sub r_tuple { my $self = shift; my $cooked = shift; my $n = $self->r_long; return () unless $n; my @rv; push @rv, scalar $self->r_object($cooked) for (1..$n); return @rv; } # This is an extended precision long read. It'll likely be incorrect if the size of the # long exceeds the precision of perl's double, and it really ought to generate a # Math::BigInt instead. sub r_extralong { my $self = shift; my $n = $self->r_long; my $size = $n<0 ? -$n : $n; my $num = 0; foreach (1..$size) { my $digit = $self->r_short(); $num *= 32768; $num += $digit; } return $num; } sub r_code { my $file = shift; my $self = bless {bytecode => $file}, 'Python::Bytecode::Codeobj'; $self->{argcount} = $file->r_word; # $self->{nlocals} = $file->r_word; # $self->{stacksize}= $file->r_word; # $self->{flags} = $file->r_word; # if ($self->{code} = $file->r_object) { if ($self->{constants}= $file->r_object(1)) { if ($self->{names} = $file->r_object) { if ($self->{varnames} = $file->r_object) { if ($self->{freevars} = $file->r_object) { if ($self->{cellvars} = $file->r_object) { if ($self->{filename} = $file->r_object) { if ($self->{name} = $file->r_object) { $self->{lineno} = $file->r_word; # $self->{lnotab} = $file->r_object; }}}}}}}} return $self; } for (qw(argcount nlocals stacksize flags code constants names varnames freevars cellvars filename name lineno lnotab)) { no strict q/subs/; eval "sub $_ { return \$_[0]->{mainobj}->$_ }"; } sub version { return $_[0]->{version}; } $Parrot::Bytecode::DATA = <<EOF; # This'll amuse you. It's actually lifted directly from dis.py :) # Instruction opcodes for compiled code def_op('STOP_CODE', 0) def_op('POP_TOP', 1) def_op('ROT_TWO', 2) def_op('ROT_THREE', 3) def_op('DUP_TOP', 4) def_op('ROT_FOUR', 5) def_op('UNARY_POSITIVE', 10) def_op('UNARY_NEGATIVE', 11) def_op('UNARY_NOT', 12) def_op('UNARY_CONVERT', 13) def_op('UNARY_INVERT', 15) def_op('BINARY_POWER', 19) def_op('BINARY_MULTIPLY', 20) def_op('BINARY_DIVIDE', 21) def_op('BINARY_MODULO', 22) def_op('BINARY_ADD', 23) def_op('BINARY_SUBTRACT', 24) def_op('BINARY_SUBSCR', 25) def_op('SLICE+0', 30) def_op('SLICE+1', 31) def_op('SLICE+2', 32) def_op('SLICE+3', 33) def_op('STORE_SLICE+0', 40) def_op('STORE_SLICE+1', 41) def_op('STORE_SLICE+2', 42) def_op('STORE_SLICE+3', 43) def_op('DELETE_SLICE+0', 50) def_op('DELETE_SLICE+1', 51) def_op('DELETE_SLICE+2', 52) def_op('DELETE_SLICE+3', 53) def_op('INPLACE_ADD', 55) def_op('INPLACE_SUBTRACT', 56) def_op('INPLACE_MULTIPLY', 57) def_op('INPLACE_DIVIDE', 58) def_op('INPLACE_MODULO', 59) def_op('STORE_SUBSCR', 60) def_op('DELETE_SUBSCR', 61) def_op('BINARY_LSHIFT', 62) def_op('BINARY_RSHIFT', 63) def_op('BINARY_AND', 64) def_op('BINARY_XOR', 65) def_op('BINARY_OR', 66) def_op('INPLACE_POWER', 67) def_op('PRINT_EXPR', 70) def_op('PRINT_ITEM', 71) def_op('PRINT_NEWLINE', 72) def_op('PRINT_ITEM_TO', 73) def_op('PRINT_NEWLINE_TO', 74) def_op('INPLACE_LSHIFT', 75) def_op('INPLACE_RSHIFT', 76) def_op('INPLACE_AND', 77) def_op('INPLACE_XOR', 78) def_op('INPLACE_OR', 79) def_op('BREAK_LOOP', 80) def_op('LOAD_LOCALS', 82) def_op('RETURN_VALUE', 83) def_op('IMPORT_STAR', 84) def_op('EXEC_STMT', 85) def_op('POP_BLOCK', 87) def_op('END_FINALLY', 88) def_op('BUILD_CLASS', 89) HAVE_ARGUMENT = 90 # Opcodes from here have an argument: name_op('STORE_NAME', 90) # Index in name list name_op('DELETE_NAME', 91) # "" def_op('UNPACK_SEQUENCE', 92) # Number of tuple items name_op('STORE_ATTR', 95) # Index in name list name_op('DELETE_ATTR', 96) # "" name_op('STORE_GLOBAL', 97) # "" name_op('DELETE_GLOBAL', 98) # "" def_op('DUP_TOPX', 99) # number of items to duplicate def_op('LOAD_CONST', 100) # Index in const list hasconst.append(100) name_op('LOAD_NAME', 101) # Index in name list def_op('BUILD_TUPLE', 102) # Number of tuple items def_op('BUILD_LIST', 103) # Number of list items def_op('BUILD_MAP', 104) # Always zero for now name_op('LOAD_ATTR', 105) # Index in name list def_op('COMPARE_OP', 106) # Comparison operator hascompare.append(106) name_op('IMPORT_NAME', 107) # Index in name list name_op('IMPORT_FROM', 108) # Index in name list jrel_op('JUMP_FORWARD', 110) # Number of bytes to skip jrel_op('JUMP_IF_FALSE', 111) # "" jrel_op('JUMP_IF_TRUE', 112) # "" jabs_op('JUMP_ABSOLUTE', 113) # Target byte offset from beginning of code jrel_op('FOR_LOOP', 114) # Number of bytes to skip name_op('LOAD_GLOBAL', 116) # Index in name list jrel_op('SETUP_LOOP', 120) # Distance to target address jrel_op('SETUP_EXCEPT', 121) # "" jrel_op('SETUP_FINALLY', 122) # "" def_op('LOAD_FAST', 124) # Local variable number haslocal.append(124) def_op('STORE_FAST', 125) # Local variable number haslocal.append(125) def_op('DELETE_FAST', 126) # Local variable number haslocal.append(126) def_op('SET_LINENO', 127) # Current line number SET_LINENO = 127 def_op('RAISE_VARARGS', 130) # Number of raise arguments (1, 2, or 3) def_op('CALL_FUNCTION', 131) # #args + (#kwargs << 8) def_op('MAKE_FUNCTION', 132) # Number of args with default values def_op('BUILD_SLICE', 133) # Number of items def_op('CALL_FUNCTION_VAR', 140) # #args + (#kwargs << 8) def_op('CALL_FUNCTION_KW', 141) # #args + (#kwargs << 8) def_op('CALL_FUNCTION_VAR_KW', 142) # #args + (#kwargs << 8) def_op('EXTENDED_ARG', 143) EXTENDED_ARG = 143 EOF # Set up op code data structures sub _init { my $self = shift; my $data = shift; my @opnames; my %c; # Natty constants. my %has; for (split /\n/, $data) { # This ought to come predigested, but I am lazy next if /^#/ or not /\S/; if (/^def_op\('([^']+)', (\d+)\)/) { $opnames[$2]=$1; } elsif (/^(jrel|jabs|name)_op\('([^']+)', (\d+)\)/) { $opnames[$3]=$2; $has{$1}{$3}++ } elsif (/(\w+)\s*=\s*(\d+)/) { $c{$1}=$2; } elsif (/^has(\w+)\.append\((\d+)\)/) { $has{$1}{$2}++ } } $self->{opnames} = \@opnames; $self->{has} = \%has; $self->{c} = \%c; } sub disassemble { my $self = shift; return $self->{mainobj}->disassemble; } # Now we've read in the op tree, disassemble it. package Python::Bytecode::Codeobj; use overload '""' => sub { my $obj = shift; "<Code object ".$obj->{name}.", file ".$obj->{filename}." line ".$obj->{lineno}." at ".sprintf('0x%x>',0+$obj); }, "0+" => sub { $_[0] }, fallback => 1; for (qw(argcount nlocals stacksize flags code constants names varnames freevars cellvars filename name lineno lnotab)) { no strict q/subs/; eval "sub $_ { return \$_[0]->{$_} }"; } sub findlabels { my $self = shift; my $bytecode = $self->{bytecode}; my %labels = (); my @code = @_; my $offset = 0; while (@code) { my $c = shift @code; $offset++; if ($c>=$bytecode->{c}{HAVE_ARGUMENT}) { my $arg = shift @code; $arg += (256 * shift (@code)); $offset += 2; if ($bytecode->{has}{jrel}{$c}) { $labels{$offset + $arg}++ }; if ($bytecode->{has}{jabs}{$c}) { $labels{$offset}++ }; } } return %labels; } my @cmp_op = ('<', '<=', '==', '!=', '>', '>=', 'in', 'not in', 'is', 'is not', 'exception match', 'BAD'); sub __printconst { my $thing = shift; my $class = ref $thing; if ($class =~ /String/ || $class =~ /Long/) { return $$thing; } if ($class =~ /Undef/) { return ""; } return $thing; } sub disassemble { my $self = shift; my $bytecode = $self->{bytecode}; my @code = map { ord } split //, $self->{code}; my %labels = $self->findlabels(@code); my $offset = 0; my $extarg = 0; my @dis; while (@code) { my $c = shift @code; my $text = (($labels{$offset}) ? ">>" : " "); $text .= sprintf "%4i", $offset; $text .= sprintf "%20s", $self->opname($c); $offset++; my $arg; if ($c>=$bytecode->{c}{HAVE_ARGUMENT}) { $arg = shift @code; $arg += (256 * shift (@code)) + $extarg; $extarg = 0; $extarg = $arg * 65535 if ($c == $bytecode->{c}{EXTENDED_ARG}); $offset+=2; $text .= sprintf "%5i", $arg; $text .= " (".__printconst($self->{constants}->[$arg]).")" if (ref $self->{constants}->[$arg] && $bytecode->{has}{const} && $bytecode->{has}{const}{$c}); $text .= " (".$self->{varnames}->[$arg].")" if ($bytecode->{has}{"local"}{$c}); $text .= " [".$self->{names}->[$arg]."]" if ($bytecode->{has}{name}{$c}); $text .= " [".$cmp_op[$arg]."]" if ($bytecode->{has}{compare}{$c}); $text .= " (to ".($offset+$arg).")" if ($bytecode->{has}{jrel}{$c}); } push @dis, [$text, $c, $arg]; } return @dis; } sub opname { $_[0]->{bytecode}{opnames}[$_[1]] } 1;