Source code for bis_protein_structure.DISULFIDE

[docs] def readMMCIF_label(mmcif_path): """ Reads an MMCIF file and extracts chain and residue information. Parameters ---------- mmcif_path : str Path to the MMCIF file. Returns ------- model : Bio.PDB.Model The first model from the parsed structure. chains : list of str List of chain IDs present in the structure. residue_dict : dict Dictionary mapping each chain to a dictionary of residue numbers and residue names. Format: {chain_id: {res_num: res_name}}. residue_idx_edit : dict Dictionary that maps chain IDs to dictionaries which map residue numbers to indices based on their sequence order. Format: {chain_id: {res_num: index}}. Notes ----- The function parses the MMCIF file, extracts chain and residue information, and then aligns the residue numbering with the label sequence identifiers provided in the MMCIF file. Only standard amino acids are included, and the sequence numbering is adjusted to fit a zero-based index for further processing. """ parser = MMCIFParser(auth_residues=False, QUIET=True) structure = parser.get_structure("structure", mmcif_path) residue_dict = {} for model in structure: for chain in model: chain_id = chain.id residue_dict[chain_id] = {} for residue in chain: res_name = residue.resname res_num = residue.id[1] if res_num in residue_dict[chain_id].keys(): continue if (res_name in res_types) and (residue.id[0]==' '): residue_dict[chain_id][res_num] = res_name break pdb_dict = MMCIF2Dict(mmcif_path) ATOM_CHECK = pdb_dict.get('_atom_site.group_PDB') auth_CHAIN_ID = pdb_dict.get('_atom_site.auth_asym_id') SEQUENCE_ID = pdb_dict.get('_atom_site.label_seq_id') chains = list(residue_dict.keys()) residue_idx_edit = {} for c_id in chains: c = c_id residue_idx_edit[c] = {} if len(residue_dict[c].keys()) == 0 : continue idx_info_dict = {} for atom_check, chain_check, label_idx in zip(ATOM_CHECK, auth_CHAIN_ID, SEQUENCE_ID): if (atom_check == 'ATOM') and (chain_check == c): if label_idx not in idx_info_dict.keys(): idx_info_dict[int(label_idx)] = int(label_idx) - 1 for r in sorted(list(residue_dict[c].keys())): idx = int(idx_info_dict[r]) r_name = residue_dict[c][r] residue_idx_edit[c][r] = idx return model, chains, residue_dict, residue_idx_edit
[docs] def get_SS_ver5(chain_id, chain, res_length, residue_idx_edit, criteria): """ Extracts the spatial coordinates of the alpha carbon (CA) or sulfur gamma (SG) atoms of residues in a protein chain and computes pairwise distances. Parameters ---------- chain_id : str Chain ID to process. chain : Bio.PDB.Chain The chain object containing residues to process. res_length : int The total number of residues in the chain. residue_idx_edit : dict A dictionary mapping residue numbers to indices for the chain. criteria : str The atom type to use for distance calculations. Valid options are 'CA' (alpha carbon) and 'SG' (sulfur gamma). Returns ------- filter_disto : numpy.ndarray A filtered distogram (distance matrix) that contains distances between Cysteine residues that meet the distance criteria. The shape is (res_length, res_length, 1). pair_list : numpy.ndarray List of residue index pairs where the distance between Cysteine residues satisfies the specified distance criteria. Notes ----- - The function computes the pairwise Euclidean distance between alpha carbons (CA) or sulfur gamma atoms (SG) in the given chain. - For the 'CA' criteria, the distance threshold is set between 3.0 and 7.5 Å, while for 'SG' criteria, it is set between 2.0 and 3.0 Å. - Only Cysteine residues are considered for the distance calculation in the case of 'SG' criteria. """ residue_length = int(res_length) calpha_coords = np.zeros([residue_length,3]) sgamma_coords = np.zeros([residue_length,3]) Cys_list = list() for residue in chain: if residue.id[0]!=' ': continue res_num_b = residue.id[1] res_name = residue.resname if res_name not in res_types: continue res_num = residue_idx_edit[chain_id][res_num_b] # For circulation if res_name in res_types.keys(): # for distogram if "CA" in residue: calpha_coords[res_num] = residue["CA"].get_coord() # for cysteine index if res_name == 'CYS': Cys_list.append(res_num) # for condition of S if criteria == 'SG': if "SG" in residue: sgamma_coords[res_num] = residue["SG"].get_coord() real_distogram = squareform(pdist(calpha_coords, 'euclidean')) # for disulfide feature if criteria == 'CA': min_dist, max_dist = 3.0, 7.5 distogram = real_distogram elif criteria == 'SG': min_dist, max_dist = 2.0, 3.0 distogram = squareform(pdist(sgamma_coords, 'euclidean')) Cys_mask1 = np.zeros((distogram.shape)) Cys_mask2 = np.zeros((distogram.shape)) if len(Cys_list) != 0: Cys_mask1[np.array(Cys_list),:] = 1 Cys_mask2[:,np.array(Cys_list)] = 1 Cys_mask = Cys_mask1 * Cys_mask2 filter_disto = distogram * Cys_mask filter_disto = np.where(filter_disto < min_dist, 0, filter_disto) filter_disto = np.where(filter_disto > max_dist, 0, filter_disto) # Reshape the matrix filter_disto = filter_disto[:,:,None] # for cysteine pair pair_list = list() for index_i, index_j in zip(np.where(filter_disto != 0)[0], np.where(filter_disto != 0)[1]): pair = [int(index_i), int(index_j)] if pair not in pair_list and list(reversed(pair)) not in pair_list: pair_list.append(pair) pair_list = np.array(pair_list) return filter_disto, pair_list