Source code for bis_protein_structure.CROSSLINK


[docs]
def readPDB(pdb_dir):
    """
    Parse a PDB file and index the recognised residues of its first model.

    Parameters
    ----------
    pdb_dir : str
        Path to the PDB file.

    Returns
    -------
    model : Bio.PDB.Model
        The first model yielded by the parsed structure.
    chains : list of str
        Chain identifiers of that model, in iteration order.
    residue_dict : dict
        ``{chain_id: {residue_number: residue_name}}`` where the number is
        ``residue.id[1]``. Only residues whose name is in ``res_types`` are
        recorded, so a chain without any such residue maps to an empty dict.
    """
    structure = PDBParser(PERMISSIVE=1).get_structure('pdb', pdb_dir)

    residue_dict = {}
    for model in structure:
        for chain in model:
            residue_dict[chain.id] = {
                residue.id[1]: residue.resname
                for residue in chain
                if residue.resname in res_types
            }
        # Only the first model is inspected; ``model`` stays bound to it.
        break

    return model, list(residue_dict), residue_dict




[docs]
def readMMCIF(mmcif_path):
    """
    Parse an MMCIF file and index the recognised residues of its first model.

    Parameters
    ----------
    mmcif_path : str
        Path to the MMCIF file.

    Returns
    -------
    model : Bio.PDB.Model
        The first model yielded by the parsed structure.
    chains : list of str
        Chain identifiers of that model, in iteration order.
    residue_dict : dict
        ``{chain_id: {residue_number: residue_name}}`` where the number is
        ``residue.id[1]``. Only residues whose name is in ``res_types`` are
        recorded, so a chain without any such residue maps to an empty dict.
    """
    structure = MMCIFParser().get_structure("structure", mmcif_path)

    residue_dict = {}
    for model in structure:
        for chain in model:
            residue_dict[chain.id] = {
                residue.id[1]: residue.resname
                for residue in chain
                if residue.resname in res_types
            }
        # Stop after the first model; ``model`` keeps referring to it.
        break

    return model, list(residue_dict), residue_dict




[docs]
def calculate_calpha_distogram(chain_id, chain, residue_dict):
    """
    Calculates the C-alpha distogram for a protein chain.

    Row/column ``i`` of the result corresponds to residue number ``i + 1``.

    Parameters
    ----------
    chain_id : str
        Chain identifier, used as key into ``residue_dict``.
    chain : Bio.PDB.Chain
        Iterable of residues; each residue must provide ``get_id()``,
        ``resname`` and ``"CA"`` membership/lookup.
    residue_dict : dict
        ``{chain_id: {residue_number: residue_name}}``.

    Returns
    -------
    residue_length : int
        Highest residue number recorded for the chain in ``residue_dict``
        (this is the matrix size, not necessarily the residue count).
    distogram : numpy.ndarray
        ``(residue_length, residue_length)`` matrix of pairwise Euclidean
        C-alpha distances. Entries involving a position without a usable
        C-alpha (numbering gap, residue not in ``res_types``, or missing
        "CA" atom) are NaN; the diagonal is 0.

    Raises
    ------
    ValueError
        If ``residue_dict[chain_id]`` is empty.
    """
    residue_length = np.array(list(residue_dict[chain_id].keys())).max()
    # Start from NaN rather than zeros: positions that never appear in
    # ``chain`` (numbering gaps) must read as "unknown", not as an atom
    # sitting at the coordinate origin.
    calpha_coords = np.full([residue_length, 3], np.nan)

    for residue in chain:
        row = residue.get_id()[1] - 1

        # Ignore residue numbers outside 1..residue_length. Without the
        # lower bound a number <= 0 becomes a negative index and silently
        # overwrites a row at the end of the array.
        if row < 0 or row >= residue_length:
            continue

        if residue.resname in res_types and "CA" in residue:
            calpha_coords[row] = residue["CA"].get_coord()
        else:
            calpha_coords[row] = np.nan

    distogram = squareform(pdist(calpha_coords, 'euclidean'))
    return residue_length, distogram




[docs]
def calculate_lys_leu_map(chain_id, chain, residue_dict):
    """
    Generates a pairwise map flagging Lysine (LYS) or Leucine (LEU) residues.

    Parameters
    ----------
    chain_id : str
        Chain identifier, used as key into ``residue_dict``.
    chain : Bio.PDB.Chain
        Unused; kept for a signature parallel to the other map builders.
    residue_dict : dict
        ``{chain_id: {residue_number: residue_name}}``.

    Returns
    -------
    residue_length : int
        Highest residue number recorded for the chain (the matrix size).
    lys_leu_map : numpy.ndarray
        ``(residue_length, residue_length)`` boolean array; entry ``[i, j]``
        is True when residue ``i + 1`` or residue ``j + 1`` is LYS or LEU.

    Raises
    ------
    ValueError
        If ``residue_dict[chain_id]`` is empty.
    """
    residues = residue_dict[chain_id]
    residue_length = np.array(list(residues.keys())).max()
    is_target = np.zeros(residue_length, dtype=bool)

    for res_num, res_name in residues.items():
        # Residue numbers <= 0 have no slot in the 1-based layout; indexing
        # with them would wrap around and flag a position at the far end.
        if res_num < 1:
            continue
        is_target[res_num - 1] = res_name in ('LYS', 'LEU')

    lys_leu_map = np.logical_or(is_target[None], is_target[:, None])
    return residue_length, lys_leu_map




[docs]
def load_list_from_file(file_path):
    """
    Unpickle and return the object stored in a file.

    Parameters
    ----------
    file_path : str
        Path to the pickle file.

    Returns
    -------
    data_list : list
        Whatever object the pickle contains (callers expect a list).

    Notes
    -----
    Unpickling can execute arbitrary code, so only pass files from a
    trusted source.
    """
    with open(file_path, 'rb') as handle:
        return pickle.load(handle)




[docs]
def calculate_sa_map(chain_id, chain, residue_dict, solvent_raw_datas):
    """
    Calculates the pairwise solvent accessibility (SA) map for a protein chain.

    Parameters
    ----------
    chain_id : str
        Chain identifier, used as key into ``residue_dict``.
    chain : Bio.PDB.Chain
        Unused; kept for a signature parallel to the other map builders.
    residue_dict : dict
        ``{chain_id: {residue_number: residue_name}}``.
    solvent_raw_datas : list
        Per-residue records indexed by ``residue_number - 1``; element ``[3]``
        of a record is converted to float and used as the accessibility
        value (presumably an absolute accessible surface area -- confirm
        against the producer of this list).

    Returns
    -------
    residue_length : int
        Highest residue number recorded for the chain (the matrix size).
    sa_map : numpy.ndarray
        ``(residue_length, residue_length)`` float array; entry ``[i, j]`` is
        the smaller of the two residues' accessibility ratios
        (value / ``SA_maximum_map[residue_name]``). The ratio is not clamped.
        Entries involving a position without data are NaN.

    Raises
    ------
    ValueError
        If ``residue_dict[chain_id]`` is empty.
    """
    residues = residue_dict[chain_id]
    residue_length = np.array(list(residues.keys())).max()
    sa_accs = np.full(residue_length, np.nan)
    record_count = len(solvent_raw_datas)

    for res_num, res_type in residues.items():
        idx = res_num - 1
        # The lower bound matters: a residue number <= 0 would otherwise
        # pass the length check and index ``solvent_raw_datas`` from its end.
        if 0 <= idx < record_count:
            sa_accs[idx] = float(solvent_raw_datas[idx][3]) / SA_maximum_map[res_type]

    # Broadcast pairwise minimum; like np.min, np.minimum propagates NaN.
    sa_map = np.minimum(sa_accs[:, None], sa_accs[None, :])
    return residue_length, sa_map




[docs]
def calculate_tryptic_map(chain_id, chain, residue_dict):
    """
    Generates a tryptic map based on the presence of Lysine (LYS) or Arginine (ARG).

    Each position gets a running count of the LYS/ARG residues that precede
    it; two positions are flagged when their counts differ by more than one.

    Parameters
    ----------
    chain_id : str
        Chain identifier, used as key into ``residue_dict``.
    chain : Bio.PDB.Chain
        Unused; kept for a signature parallel to the other map builders.
    residue_dict : dict
        ``{chain_id: {residue_number: residue_name}}``.

    Returns
    -------
    residue_length : int
        Highest residue number recorded for the chain (the matrix size).
    tryptic_map : numpy.ndarray
        ``(residue_length, residue_length)`` float array holding 1.0 where
        the LYS/ARG counts of the two positions differ by more than one and
        0.0 elsewhere.

    Raises
    ------
    ValueError
        If ``residue_dict[chain_id]`` is empty.
    """
    residues = residue_dict[chain_id]
    residue_length = np.array(list(residues.keys())).max()
    tryptic_num = np.zeros([residue_length])

    for res_num, res_name in residues.items():
        if res_name in ('LYS', 'ARG'):
            # Positions after residue ``res_num`` start at index ``res_num``.
            # Clamp at 0 so a site numbered <= 0 counts for every position
            # instead of turning into a negative (tail-only) slice start.
            tryptic_num[max(res_num, 0):] += 1

    count_gap = np.abs(tryptic_num[:, None] - tryptic_num[None, :])
    # Keep the float 0/1 dtype the original loop-filled array had.
    tryptic_map = (count_gap > 1).astype(float)
    return residue_length, tryptic_map




[docs]
def plot_cross_link(residue_length, distogram, lys_leu_map, sa_map, tryptic_map):
    """
    Show five side-by-side panels summarising the cross-link filters.

    Panels, left to right: C-alpha distances below 10, the LYS/LEU map,
    solvent accessibility above 0.5, the tryptic map, and the candidate
    pairs whose distance -- after being multiplied by the three masks --
    lies strictly between 5 and 25.

    Parameters
    ----------
    residue_length : int
        Number of residues in the chain (not used by the plot itself).
    distogram : numpy.ndarray
        Pairwise C-alpha distance matrix.
    lys_leu_map : numpy.ndarray
        LYS and LEU residue map.
    sa_map : numpy.ndarray
        Solvent accessibility map.
    tryptic_map : numpy.ndarray
        Tryptic cleavage map.
    """
    def draw_panel(slot, heading, image, **imshow_options):
        # One cell of the 1x5 grid: select it, title it, render the image.
        plt.subplot(1, 5, slot)
        plt.title(heading)
        plt.imshow(image, **imshow_options)

    plt.figure(figsize=(20, 5), facecolor='white')

    draw_panel(1, 'Ca Distogram, < 10 Å', distogram < 10)
    draw_panel(2, 'LYS_LEU_MAP', lys_leu_map, cmap='binary')
    draw_panel(3, 'Solvent_Accs > 0.5', sa_map > 0.5, cmap='binary')
    draw_panel(4, 'Tryptic map', tryptic_map, cmap='binary')

    # Zero out distances for pairs rejected by any of the three masks, then
    # keep only those that fall inside the (5, 25) window.
    masked_distances = distogram * lys_leu_map * (sa_map > 0.5) * tryptic_map
    candidates = np.logical_and(masked_distances > 5, masked_distances < 25)
    draw_panel(5, 'CrossLink Candidates', candidates, cmap='binary')

    plt.tight_layout()
    plt.show()