Chemistry::File::SLN - SLN linear notation parser/writer


Chemistry-File-SLN documentation Contained in the Chemistry-File-SLN distribution.

Index


Code Index:

NAME

Top

Chemistry::File::SLN - SLN linear notation parser/writer

SYNOPSYS

Top

    #!/usr/bin/perl
    use Chemistry::File::SLN;

    # parse a SLN string for benzene
    my $s = 'C[1]H:CH:CH:CH:CH:CH@1';
    my $mol = Chemistry::Mol->parse($s, format => 'sln');

    # print a SLN string
    print $mol->print(format => 'sln');

    # print a unique (canonical) SLN string
    print $mol->print(format => 'sln', unique => 1);

    # parse a multiline SLN file
    my @mols = Chemistry::Mol->read("file.sln", format => 'sln');

    # write a multiline SLN file
    Chemistry::Mol->write("file.sln", mols => [@mols]);




DESCRIPTION

Top

This module parses a SLN (Sybyl Line Notation) string. This is a File I/O driver for the PerlMol project. http://www.perlmol.org/. It registers the 'sln' format with Chemistry::Mol, and recognizes filenames ending in '.sln'.

Optional attributes for atoms, bonds, and molecules are stored as $atom->attr("sln/attr"), $bond->attr("sln/attr"), and $mol->attr("sln/attr"), respectively. Boolean attributes are stored with a value of 'TRUE'. That's the way boolean attributes are recognized when writing, so that they can be written in the shortened form.

    $sln_attr->{backbone} = 1;
    # would be ouput as "C[backbone=1]"

    $sln_attr->{backbone} = 'TRUE';
    # would be ouput as "C[backbone]"

Also note that attribute names are normalized to lowercase on reading.

OPTIONS

Top

The following options are available when reading:

kekulize

Assign bond orders for unsatisfied valences or for aromatic bonds. For example, benzene read as C[1]H:CH:CH:CH:CH:CH@1 will be converted internally to something like C[1]H=CHCH=CHCH=CH@1. This is needed if another format or module expects a Kekule representation without an aromatic bond type.

The following options are available when writing:

mols

If this option points to an array of molecules, these molecules will be written, one per line, as in the example in the SYNOPSYS.

aromatic

Detect aromaticity before writing. This will ensure that aromatic bond types are used instead of alternate single and double bonds.

unique

Canonicalize before writing, and produce a unique strucure. NOTE: this option does not guarantee a unique representation for molecules with bracketed attributes.

name

Include the name of the molecule ($mol->name) in the output string.

coord3d, coords

Include the 3D coordinates of every atom in the molecule in the output string. coord3d and coords may be used interchangeably.

attr

Output the atom, bond, and molecule attributes found in $mol->attr("sln/attr"), etc.

CAVEATS

Top

This version does not implement the full SLN specification. It supports simple structures and some attributes, but it does not support any of the following:

Macro atoms
Pattern matching options
Markush structures
2D Coordinates

The SLN specification is vague on several points, and I don't have a reference implementation available, so I had to make several arbitrary decisions. Also, this version of this module has not been tested exhaustively, so please report any bugs that you find.

If the parser doesn't understand a string, it only says "syntax error", which may not be very helpful.

VERSION

Top

0.11

SEE ALSO

Top

Chemistry::Mol, Chemistry::File, Chemistry::File::SMILES

The PerlMol website http://www.perlmol.org/

Ash, S.; Cline, M. A.; Homer, R. W.; Hurst, T.; Smith, G. B., SYBYL Line Notation (SLN): A Versatile Language for Chemical Structure Representation. J. Chem. Inf. Comput. Sci; 1997; 37(1); 71-79. DOI: 10.1021/ci960109j (http://dx.doi.org/10.1021/ci960109j)

AUTHOR

Top

Ivan Tubert-Brohman <itub@cpan.org>

COPYRIGHT

Top


Chemistry-File-SLN documentation Contained in the Chemistry-File-SLN distribution.
package Chemistry::File::SLN;

$VERSION = "0.11";
# $Id: SLN.pm,v 1.4 2005/03/29 16:38:06 itubert Exp $

use 5.006;
use strict;
use warnings;
use base "Chemistry::File";
use Chemistry::Mol;
use Chemistry::File::SLN::Parser;
use Chemistry::Bond::Find 'assign_bond_orders';
use List::Util qw(sum);

# INITIALIZATION
Chemistry::Mol->register_format('sln');
my $Parser = Chemistry::File::SLN::Parser->new;

sub name_is {
    my ($self, $name) = @_;
    $name =~ /\.sln$/i;
}

sub file_is {
    $_[0]->name_is($_[1]);
}

sub parse_string {
    my ($self, $string, %opts) = @_;

    my (@lines) = split /(?:\n|\r\n?)/, $string;
    my @mols;
    for my $line (@lines) {
        my $mol = $self->parse_single_line($line, %opts);
        return $mol unless wantarray;
        push @mols, $mol;
    }
    @mols;
}

sub parse_single_line {
    my ($self, $string, %opts) = @_;

    my $mol_class = $opts{mol_class} || "Chemistry::Mol";


    # call the actual yapp-generated parser
    my $tree = $Parser->run($string) or return;
    #use Data::Dumper; print Dumper $tree;

    my $mol = $mol_class->new;
    my @nodes = @{$tree->{chain}};
    my %closures;
    my $last_atom;
    my @stack;
    
    while (my $node = shift @nodes) {
        if ($node eq '(') {
            push @stack, $last_atom;
        } elsif ($node eq ')') {
            $last_atom = pop @stack;
        } elsif($last_atom) { # bond
            my $next = shift @nodes;
            if ($next->{closure}) {
                my $atom = $closures{$next->{closure}};
                $self->compile_bond($mol, $node, $last_atom, $atom);
            } else {
                my $atom = $self->compile_atom($mol, $next, \%closures);
                $self->compile_bond($mol, $node, $last_atom, $atom);
                $last_atom = $atom;
            }
        } else {  # first atom
            $last_atom = $self->compile_atom($mol, $node, \%closures);
        }
    }
    if ($opts{kekulize}) {
        assign_bond_orders($mol, method => "itub", use_coords => 0, 
            scratch => 0, charges => 0);
    }
    my @sln_attr;
    while (my ($attr, $value) = each %{$tree->{attr}}) {
        if ($attr eq 'name') {
            $mol->name($value);
        } elsif ($attr eq 'type') {
            $mol->type($value);
        } elsif ($attr eq 'coord3d') {
            $self->read_coords($mol, $value);
        } else {
            push @sln_attr, $attr, $value;
        }
    }
    $mol->attr("sln/attr", {@sln_attr}) if @sln_attr;
    $mol;

}

sub compile_atom {
    my ($self, $mol, $node, $closures) = @_;
    my $atom = $mol->new_atom(
        symbol          => $node->{symbol},
        hydrogens       => $node->{hcount},
        formal_charge   => $node->{attr}{charge},
    );
    $atom->attr("sln/attr", $node->{attr});
    delete $node->{attr}{charge};
    $closures->{$node->{id}} = $atom if $node->{id};
    $atom;
}

my %TYPE_TO_ORDER = (
    '-' => 1,
    '=' => 2,
    '#' => 3,
    ':' => 1, 
    '.' => 0,
);

sub compile_bond {
    my ($self, $mol, $node, $atom1, $atom2) = @_;
    my $order = $TYPE_TO_ORDER{$node->{type}};
    if ($order) {
        my $bond = $mol->new_bond(
            type => $node->{type}, 
            atoms=>[$atom1, $atom2],
            order => $order,
        );
        $bond->attr("sln/attr", $node->{attr});
        if ($node->{type} eq ':') { 
            $_->aromatic(1) for ($atom1, $atom2, $bond);
        }
    }
}

sub read_coords {
    my ($self, $mol, $coords_str) = @_;
    $coords_str =~ s/[()]//g;
    my (@coords) = split /,/, $coords_str;
    my $fh = $mol->formula_hash;
    my $n = sum(values %$fh);
    my $sprout = (@coords == 3*$n);
    for my $atom ($mol->atoms) {
        $atom->coords(splice @coords, 0, 3);
        if ($sprout) {
            for (1 .. $atom->implicit_hydrogens) {
                my $H = $mol->new_atom(symbol => 'H', 
                    coords => [splice @coords, 0, 3]);
                $mol->new_bond(atoms => [$atom, $H]);
            }
            $atom->implicit_hydrogens(0);
        } 
    }
}


########### WRITER #################


sub write_string {
    my ($self, $mol_ref, %opts) = @_;

    my $eol;
    my @mols;
    if ($opts{mols}) {
        @mols = @{$opts{mols}};
        $eol = "\n";
    } else {
        @mols = $mol_ref; 
        $eol = "";
    }

    my $sln;
    for my $mol (@mols) {
        $sln .= $self->write_mol($mol, %opts) . $eol;
    }
    $sln;
}

sub write_mol {
    my ($self, $mol, %opts) = @_;

    my $oldmol = $mol;
    $mol = $mol->clone; 

    my $sln = '';
    my @id_log;
    if ($mol->atoms) {
        my @atoms = $self->clean_mol($mol, %opts);

        my $visited = {};
        my @s;
        for my $atom (@atoms) {
            next if $visited->{$atom};
            my $ring_atoms = {};

            # first pass to find and number the ring bonds
            $self->find_ring_bonds($mol, \%opts, $atom, undef, {}, $ring_atoms);

            # second pass to actually generate the sln string
            push @s, $self->branch($mol, \%opts, $atom, undef, $visited, 
                $ring_atoms, \@id_log);
        }
        $sln .= join '.', @s;
    }

    $sln .= $self->format_ctab_attr($mol, \%opts, $oldmol, \@id_log);
}

sub clean_mol {
    my ($self, $mol, %opts) = @_;

    $self->collapse_hydrogens($mol);
    my @atoms = $mol->atoms;
    if ($opts{unique}) {
        unless ($atoms[0]->attr("canon/class")) {
            require Chemistry::Canonicalize;
            Chemistry::Canonicalize::canonicalize($mol);
        }
        #$opts{aromatic} = 1; # all unique sln have to be aromatic
        @atoms = sort {
            $a->attr("canon/class") <=> $b->attr("canon/class")
        } @atoms;
    }

    if ($opts{aromatic}) {
        require Chemistry::Ring;
        Chemistry::Ring::aromatize_mol($mol);
    }
    @atoms;
}

sub format_ctab_attr {
    my ($self, $mol, $opts, $oldmol, $id_log) = @_;

    my $sln = '';
    if ($opts->{name} or $opts->{attr} or $opts->{coords} or $opts->{coord3d}) {
        no warnings 'uninitialized';
        my @attr;
        my $name = $mol->name;
        $name =~ s/[\r\n]//g;
        push @attr, 'name="' . $mol->name . '"' 
            if $opts->{name} and length $mol->name;
        my @coords;
        if ($opts->{coord3d} or $opts->{coords}) {
            my @all_atoms = map { 
                (
                    $oldmol->by_id($_), 
                    grep {$_->symbol eq 'H'}
                        $oldmol->by_id($_)->neighbors
                )
            } @$id_log;
            push @coords, sprintf("(%.3f,%.3f,%.3f)",$_->coords->array)
                for @all_atoms;
            push @attr, 'coord3d=' . join(',',@coords);
        }
        if ($opts->{attr}) {
            push @attr, $self->format_sln_attr($mol);
        }
        $sln .= '<' . join(';', @attr) . '>' if @attr;
    }
    $sln;
}


sub find_ring_bonds {
    my ($self, $mol, $opts, $atom, $from_bond, $visited, $ring_atoms) = @_;

    $visited->{$atom}  = 1;
    for my $bn (sorted_bonds_neighbors($atom, $opts)) {
        my $nei  = $bn->{to};
        my $bond = $bn->{bond};
        next if $visited->{$bond};
        $visited->{$bond}  = 1;
        if ($visited->{$nei}) { # closed ring
            #print "closing ring\n";
            $ring_atoms->{$nei}++;
        } else {
            $self->find_ring_bonds($mol, $opts, $nei, 
                $bond, $visited, $ring_atoms);
        }
    }
}

sub branch {
    my ($self, $mol, $opts, $atom, $from_bond, $visited, $digits, $id_log) = @_;

    my $prev_branch = "";
    my $sln;
    $sln .= $self->format_bond($from_bond, $opts);
    my $digit;
    if ($digits->{$atom}) {  # opening a ring
        $digit = $self->next_digit($digits);
        $digits->{$atom} = $digit;
    }
    $sln .= $self->format_atom($atom, $opts, $digit);
    push @$id_log, $atom->id;

    $visited->{$atom}  = 1;
    my @bns = sorted_bonds_neighbors($atom, $opts);

    for my $bn (@bns) {
        my $nei  = $bn->{to};
        my $bond = $bn->{bond};
        next if $visited->{$bond};
        $visited->{$bond} = 1;
        if ($visited->{$nei}) { # closed a ring
            if ($prev_branch) {
                $sln .= "($prev_branch)";
            }
            $prev_branch = $self->format_bond($bond, $opts) 
                . '@' . $digits->{$nei};
            $visited->{$bond} = 1;
        } else {
            my $branch = $self->branch($mol, $opts, $nei, $bond, $visited, 
                $digits, $id_log);
            if ($prev_branch) {
                $sln .= "($prev_branch)";
            }
            $prev_branch = $branch;
        }
    }
    $sln .= "$prev_branch";
    $sln;
}

sub next_digit {
    my ($self, $digits) = @_;
    ++$digits->{used_digits};
}

sub collapse_hydrogens {
    my ($self, $mol) = @_;

    for my $atom (grep {$_->symbol eq 'H'} $mol->atoms) {
        my ($neighbor) = $atom->neighbors or next;
        $atom->delete;
        my $h_count = $neighbor->hydrogens;
        $h_count++;
        $neighbor->hydrogens($h_count);
    }
}

sub sorted_bonds_neighbors {
    my ($atom, $opts) = @_;
    my @bn = $atom->bonds_neighbors;
    if ($opts->{unique}) {
        @bn = sort { 
            $a->{to}->attr("canon/class") <=> $b->{to}->attr("canon/class") 
        } @bn;
    }
    @bn;
}

my %ORDER_TO_TYPE = (
    1 => '', 2 => '=', 3 => '#', 4 => '', 0 => '.', 
);

sub format_bond {
    my ($self, $bond, $opts) = @_;
    return '' unless $bond;
    my $s = $bond->aromatic ? ':' : $ORDER_TO_TYPE{$bond->order};
    my @attr;
    @attr = $self->format_sln_attr($bond) if $opts->{attr};
    if (@attr) {
        $s .= '[' . join(";", @attr) . ']';
    }
    $s;
}

sub format_atom {
    my ($self, $atom, $opts, $digit) = @_;
    my $s;
    no warnings 'uninitialized';
    my $h_count = $atom->hydrogens;
    my $charge  = $atom->formal_charge;
    my $symbol  = $atom->symbol;

    $charge  = $charge ? sprintf("%+d", $charge): '';
    $h_count = $h_count ? ($h_count > 1 ? "H$h_count" : 'H') : '';

    $s = $symbol;
    my @attr;
    @attr = $self->format_sln_attr($atom) if $opts->{attr};
    if ($charge or $digit or @attr) {
        $s .= '['; 
        $s .= $digit;
        unshift @attr, $charge if $charge;
        if (@attr) {
            $s .= ':' if $digit;
            $s .= join ';', @attr;
       }
       $s .= ']';
    }
    $s .= $h_count;
    $s;
}

sub format_sln_attr {
    my ($self, $obj) = @_;
    my $sln_attr = $obj->attr("sln/attr") || {};
    my @attr;
    for my $key (sort keys %$sln_attr) {
        my $val = $sln_attr->{$key};
        push @attr, "$key" . ($val eq 'TRUE' ? "" : "=$val");
    }
    @attr;
}

1;