Source code for segmentation.geostas

"""
Bio3D is an R package that encapsulates different tools for biological
structure analysis. The GeoStaS domain finder tries to locate rigid domains
based on trajectory movements. The generated files come from a custom script
written in R, so their format is controlled by our workflow rather than by the
external project itself.

Documentation can be found at:
http://thegrantlab.org/bio3d/reference/geostas.html
"""

import json
from pathlib import Path
from collections.abc import Iterator
from typing import Tuple

from lib.trajectory import Trajectory
from . import SegmentationParser



[docs]
class Parser(SegmentationParser):
    def __init__(self, pdb_path, clustering_directory_path):
        super().__init__(pdb_path, clustering_directory_path)


[docs]
    def parse(self) -> Iterator[Tuple[str, int, str]]:
        pdb_path, clustering_directory_path = self.paths

        trajectory = Trajectory.from_paths(pdb_path)
        atom_data  = trajectory.select_atoms('name = CA')

        for file in sorted(Path(clustering_directory_path).glob('clustering_kmeans_*.json')):
            atom_groups    = json.loads(Path(file).read_text())
            residue_groups = self._translate_atoms_to_residues(atom_data, atom_groups)
            chopping       = self._generate_chopping(residue_groups)
            method         = "GeoStaS K-means"

            yield (method, len(residue_groups), chopping)

        for file in sorted(Path(clustering_directory_path).glob('clustering_hier_*.json')):
            atom_groups    = json.loads(Path(file).read_text())
            residue_groups = self._translate_atoms_to_residues(atom_data, atom_groups)
            chopping       = self._generate_chopping(residue_groups)
            method         = "GeoStaS Hierarchical"

            yield (method, len(residue_groups), chopping)



[docs]
    def _translate_atoms_to_residues(self, atom_data, atom_groups):
        """
        The output of GeoStaS is (alpha carbon) atom indices (1-indexed) while
        the output of the parser is in residues. This method translates the
        sequential atom indices into the residues they correspond to by using
        an MDAnalysis universe.
        """
        return [
            [
                atom_data[atom_index - 1].resnum
                for atom_index in atom_indices
            ]
            for atom_indices in atom_groups
        ]



[docs]
    def _generate_chopping(self, residue_groups):
        """
        Input: ``[[1, 2, 3], [10, 11, 20, 21], ...]``
        Output: ``1-3,10-11_20,21,...``
        """
        # Structure: { group: [(start, end), (start, end), ...] }
        groupings = {}

        for i, residues in enumerate(residue_groups):
            group = i + 1

            for new_residue in residues:
                if group not in groupings:
                    groupings[group] = [(new_residue, new_residue)]
                    continue

                last_pair = groupings[group][-1]
                start_residue, end_residue = last_pair

                if new_residue - end_residue == 1:
                    # this is the next one, just extend the range:
                    groupings[group][-1] = (start_residue, new_residue)
                else:
                    groupings[group].append((new_residue, new_residue))

        group_descriptions = []
        for i in range(1, len(groupings.keys()) + 1):
            range_description = [f"{start}-{end}" for start, end in groupings[i]]
            group_descriptions.append('_'.join(range_description))

        return ','.join(group_descriptions)
Source code for segmentation.geostas

protein-runway

Navigation

Related Topics