Source code for bis_protein_structure.SOLVENTACC

import numpy as np
import os
import shutil
import zipfile
import pickle
from tqdm import tqdm

from Bio.PDB.MMCIFParser import MMCIFParser
from Bio.PDB import PDBParser
from Bio.PDB.DSSP import DSSP
from Bio.PDB import PDBIO
from Bio.PDB import Structure, Model, Chain, Residue, Atom
from Bio.PDB.SASA import ShrakeRupley
from DSSPparser import parseDSSP

import concurrent.futures
import subprocess
from concurrent.futures import ThreadPoolExecutor, as_completed, ProcessPoolExecutor

TMscore_path = './TMscore'

res_types = {'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D', 'CYS': 'C', 'GLN': 'Q', 'GLU': 'E', 'GLY': 'G', 'HIS': 'H', 'ILE': 'I',
             'LEU': 'L', 'LYS': 'K', 'MET': 'M', 'PHE': 'F', 'PRO': 'P', 'SER': 'S', 'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V',
             'UNK': '-', 'XAA': 'X'}
res_map = {'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D', 'CYS': 'C', 'GLN': 'Q', 'GLU': 'E', 'GLY': 'G', 'HIS': 'H', 'ILE': 'I', 'LEU': 'L',
           'LYS': 'K', 'MET': 'M', 'PHE': 'F', 'PRO': 'P', 'SER': 'S', 'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V', 'UNK': '-'}

res_map_three_word = list(res_types.keys())
res_map_one_word = list(res_types.values())
restype_to_aaorder = {}
res_map_one_to_three = {}

for i, one_word in enumerate(res_map_one_word):
    res_map_one_to_three[one_word] = res_map_three_word[i]

for i, three_word in enumerate(res_map_three_word):
    if i == 21: i = 20
    restype_to_aaorder[three_word] = i

SA_maximum_map = {'PHE': 210, 'ILE': 175, 'LEU': 170, 'VAL': 155, 'PRO': 145, 'ALA': 115,
                  'GLY': 75, 'MET': 185, 'CYS': 135, 'TRP': 255, 'TYR': 230, 'THR': 140,
                  'SER': 115, 'GLN': 180, 'ASN': 160, 'GLU': 190, 'ASP': 150, 'HIS': 195,
                  'LYS': 200, 'ARG': 225, 'XAA': 170}

atom_types = {"N": 0, "CA": 1, "C": 2, "CB": 3, "O": 4, "CG": 5, "CG1": 6, "CG2": 7, "OG": 8, "OG1": 9, "SG": 10, "CD": 11, "CD1": 12,
              "CD2": 13, "ND1": 14, "ND2": 15, "OD1": 16, "OD2": 17, "SD": 18, "CE": 19, "CE1": 20, "CE2": 21, "CE3": 22, "NE": 23,
              "NE1": 24, "NE2": 25, "OE1": 26, "OE2": 27, "CH2": 28, "NH1": 29, "NH2": 30, "OH": 31, "CZ": 32, "CZ2": 33, "CZ3": 34,
              "NZ": 35, "OXT": 36}

index_to_atom_name = {v: k for k, v in atom_types.items()}



[docs]
def load_pickle_file(file_path):
    """Load data from a pickle file.

    Parameters
    ----------
    file_path : str
        The path to the pickle file.

    Returns
    -------
    data
        The data loaded from the pickle file.
    """
    with open(file_path, 'rb') as file:
        data = pickle.load(file)
    return data




[docs]
def getTMscore(pdb_path1, pdb_path2):
    """Calculate the TM-score between two PDB files.

    Parameters
    ----------
    pdb_path1 : str
        Path to the first PDB file.
    pdb_path2 : str
        Path to the second PDB file.

    Returns
    -------
    float
        The TM-score between the two structures.
    """
    cmd = TMscore_path + ' ' + pdb_path1 + ' ' + pdb_path2
    p = subprocess.Popen(cmd,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         shell=True)
    
    stdout, stderr = p.communicate()
    outputs = stdout.decode().split('\n')
    tm = 0.0
    for line in outputs:
        if line.startswith('TM-score'):
            tm = float(line.split(' ')[5])
    return tm




[docs]
def convert_mmcif_to_dssp(mmcif_path, dssp_root):
    """Convert a MMCIF file to DSSP format.

    Parameters
    ----------
    mmcif_path : str
        Path to the MMCIF file.
    dssp_root : str
        Directory where the DSSP file will be saved.
    """
    mmcif = mmcif_path.split('/')[-1]
    dssp = mmcif.replace('.cif', '.dssp')
    dssp_path = os.path.join(dssp_root, dssp)
    
    dssp_executable = "mkdssp"  # Change this if DSSP is installed with a different name
    subprocess.run([dssp_executable, "-i", mmcif_path, "-o", dssp_path])

    


[docs]
def convert_mmcif_to_dssp_parallel(mmcif_paths, dssp_root, max_workers=4):
    """Convert multiple MMCIF files to DSSP format in parallel.

    Parameters
    ----------
    mmcif_paths : list of str
        List of paths to the MMCIF files.
    dssp_root : str
        Directory where the DSSP files will be saved.
    max_workers : int, optional
        Maximum number of workers for parallel processing (default is 4).
    """
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Using list comprehension to create futures
        futures = [executor.submit(convert_mmcif_to_dssp, mmcif_path, dssp_root) for mmcif_path in mmcif_paths]

        # Wait for all futures to complete
        for future in futures:
            future.result()




[docs]
def convert_pdb_to_dssp(pdb_path, dssp_root):
    """Convert a PDB file to DSSP format.

    Parameters
    ----------
    pdb_path : str
        Path to the PDB file.
    dssp_root : str
        Directory where the DSSP file will be saved.
    """
    pdb = pdb_path.split('/')[-1]
    dssp = pdb.replace('.pdb', '.dssp')
    dssp_path = os.path.join(dssp_root, dssp)
    
    dssp_executable = "mkdssp"  # Change this if DSSP is installed with a different name
    subprocess.run([dssp_executable, "-i", pdb_path, "-o", dssp_path])




[docs]
def contains_non_numeric(input_string):
    """Check if the input string contains non-numeric characters.

    Parameters
    ----------
    input_string : str
        The string to be checked.

    Returns
    -------
    bool
        True if the string contains non-numeric characters, False otherwise.
    """
    for char in input_string:
        if not char.isnumeric():
            return True
    return False




[docs]
def create_structure_from_feature(sequence, all_atom_positions, all_atom_mask, structure_id="pred", model_id=0, chain_id="A"):
    """Create a Biopython Structure object from sequence and atomic features.

    Parameters
    ----------
    sequence : str
        The amino acid sequence.
    all_atom_positions : np.ndarray
        Array of shape (n_residues, n_atoms, 3) containing atom coordinates.
    all_atom_mask : np.ndarray
        Array of shape (n_residues, n_atoms) indicating which atoms are present.
    structure_id : str, optional
        The ID of the structure (default is "pred").
    model_id : int, optional
        The model ID (default is 0).
    chain_id : str, optional
        The chain ID (default is "A").

    Returns
    -------
    Structure
        A Biopython Structure object representing the protein.
    """
    structure = Structure.Structure(structure_id)
    model = Model.Model(model_id)
    chain = Chain.Chain(chain_id)

    for i in range(len(sequence)):
        res_name = res_map_one_to_three.get(sequence[i], "UNK")
        res_id = (' ', i + 1, ' ')  # Residue ID (hetero flag, resSeq, icode)
        residue = Residue.Residue(res_id, res_name, ' ')
        
        # Create atoms for the residue
        for j, atom_position in enumerate(all_atom_positions[i]):
            if all_atom_mask[i, j] == 1:  # Only include present atoms
                atom_name = index_to_atom_name[j]
                atom = Atom.Atom(atom_name, atom_position, 1.0, 1.0, ' ', res_id[1], res_name, res_id)
                residue.add(atom)
        
        chain.add(residue)
    
    model.add(chain)
    structure.add(model)
    return structure




[docs]
def extract_coords_from_pdb(pdb_path):
    """Extract atomic coordinates from a PDB file.

    Parameters
    ----------
    pdb_path : str
        Path to the PDB file.

    Returns
    -------
    tuple
        A tuple containing the sequence and an array of atomic coordinates.
    """
    parser = PDBParser(QUIET=True)
    structure = parser.get_structure("PDB", pdb_path)

    all_atom_positions = []
    sequence = []
    
    for model in structure:
        for chain in model:
            for res in chain:
                if res.get_id()[0] == ' ':  # Exclude hetero atoms
                    res_name = res.get_resname()
                    sequence.append(res_name)
                    res_positions = []
                    
                    for atom in res:
                        res_positions.append(atom.get_coord())
                    
                    all_atom_positions.append(res_positions)
    
    return sequence, np.array(all_atom_positions)