# Source code for bis_protein_structure.split_pdb

from Bio.PDB.MMCIFParser import MMCIFParser
from Bio.PDB.MMCIF2Dict import MMCIF2Dict
from Bio.PDB.mmcifio import MMCIFIO
from Bio.PDB import Dice
import os
import pickle
import pandas as pd
import warnings
warnings.filterwarnings(action='ignore')
from tqdm import tqdm
import re
import numpy as np

# Lookup from three-letter residue names to one-letter codes: the twenty
# standard amino acids, plus 'UNK' (unknown residue), which maps to '-'.
# Residues whose name is not a key here are skipped by the code below.
res_types = dict(
    ALA='A',
    ARG='R',
    ASN='N',
    ASP='D',
    CYS='C',
    GLN='Q',
    GLU='E',
    GLY='G',
    HIS='H',
    ILE='I',
    LEU='L',
    LYS='K',
    MET='M',
    PHE='F',
    PRO='P',
    SER='S',
    THR='T',
    TRP='W',
    TYR='Y',
    VAL='V',
    UNK='-',
)


# [docs]
def extract(structure, chain_id, start, end, filename):
    """Save a residue range of one chain from a structure as an mmCIF file.

    Parameters
    ----------
    structure : Bio.PDB.Structure
        Parsed structure the residues are taken from.
    chain_id : str
        Identifier of the chain to keep.
    start : int
        Residue index at which the kept range begins.
    end : int
        Residue index at which the kept range ends.
    filename : str
        Destination path of the file that is written.

    Returns
    -------
    None
    """
    writer = MMCIFIO()
    writer.set_structure(structure)
    # The selector decides which parts of the structure reach the file.
    writer.save(filename, Dice.ChainSelector(chain_id, start, end))




# [docs]
def split_the_pdb(cif_path, pdb, chain, out_dir=None):
    """Write the recognised-residue span of one chain to its own CIF file.

    The structure at ``cif_path`` is parsed and the chain whose id equals
    ``chain`` is looked up. The residue numbers of its residues whose name
    is a key of ``res_types`` are collected, and the range they cover is
    written with :func:`extract` to ``<output dir>/<pdb>_<chain>.cif``.
    Nothing is written when the chain is absent or has no such residues.

    Parameters
    ----------
    cif_path : str
        The path to the input CIF file.
    pdb : str
        The name of the PDB structure; used in the structure id and in the
        output file name.
    chain : str
        The chain identifier to extract from the structure.
    out_dir : str, optional
        Directory the output file is written to. When omitted, the
        module-level name ``new_path`` is used, as in earlier versions.

    Returns
    -------
    None

    Notes
    -----
    The end of the range is the largest collected residue number rather
    than ``start + count - 1``; the latter cuts off the tail of any chain
    whose numbering has gaps.
    """
    parser = MMCIFParser(QUIET=True)
    structure = parser.get_structure(f'{pdb}_{chain}', cif_path)

    for model in structure:
        for c in model:
            if c.id != chain:
                continue

            # Residue numbers (second field of the residue id) of every
            # residue whose name is known to ``res_types``.
            res_numbers = [
                res.get_id()[1] for res in c if res.resname in res_types
            ]
            if not res_numbers:
                continue

            # NOTE(review): ``new_path`` is not defined in this function; it
            # is expected to exist at module level when ``out_dir`` is not
            # given -- confirm. It is resolved only here, right before a
            # file is written, exactly as before.
            target_dir = new_path if out_dir is None else out_dir
            filename = f'{target_dir}/{pdb}_{c.id}.cif'

            extract(
                structure=structure,
                chain_id=c.id,
                start=res_numbers[0],
                # Use the real last residue number so that numbering gaps
                # do not truncate the end of the chain.
                end=max(res_numbers),
                filename=filename,
            )

            # The output path does not depend on the model and ``extract``
            # receives the whole structure, so further models would only
            # overwrite the same file; one write is enough.
            return