[docs]defreadMMCIF_label(mmcif_path):""" Reads an MMCIF file and extracts chain and residue information. Parameters ---------- mmcif_path : str Path to the MMCIF file. Returns ------- model : Bio.PDB.Model The first model from the parsed structure. chains : list of str List of chain IDs present in the structure. residue_dict : dict Dictionary mapping each chain to a dictionary of residue numbers and residue names. Format: {chain_id: {res_num: res_name}}. residue_idx_edit : dict Dictionary that maps chain IDs to dictionaries which map residue numbers to indices based on their sequence order. Format: {chain_id: {res_num: index}}. Notes ----- The function parses the MMCIF file, extracts chain and residue information, and then aligns the residue numbering with the label sequence identifiers provided in the MMCIF file. Only standard amino acids are included, and the sequence numbering is adjusted to fit a zero-based index for further processing. """parser=MMCIFParser(auth_residues=False,QUIET=True)structure=parser.get_structure("structure",mmcif_path)residue_dict={}formodelinstructure:forchaininmodel:chain_id=chain.idresidue_dict[chain_id]={}forresidueinchain:res_name=residue.resnameres_num=residue.id[1]ifres_numinresidue_dict[chain_id].keys():continueif(res_nameinres_types)and(residue.id[0]==' '):residue_dict[chain_id][res_num]=res_namebreakpdb_dict=MMCIF2Dict(mmcif_path)ATOM_CHECK=pdb_dict.get('_atom_site.group_PDB')auth_CHAIN_ID=pdb_dict.get('_atom_site.auth_asym_id')SEQUENCE_ID=pdb_dict.get('_atom_site.label_seq_id')chains=list(residue_dict.keys())residue_idx_edit={}forc_idinchains:c=c_idresidue_idx_edit[c]={}iflen(residue_dict[c].keys())==0:continueidx_info_dict={}foratom_check,chain_check,label_idxinzip(ATOM_CHECK,auth_CHAIN_ID,SEQUENCE_ID):if(atom_check=='ATOM')and(chain_check==c):iflabel_idxnotinidx_info_dict.keys():idx_info_dict[int(label_idx)]=int(label_idx)-1forrinsorted(list(residue_dict[c].keys())):idx=int(idx_info_dict[r])r_name=residue_dict[c][r]residue_idx_edit[c][r]=idxreturnmodel,chains,residue_dict,residue_idx_edit
[docs]defget_SS_ver5(chain_id,chain,res_length,residue_idx_edit,criteria):""" Extracts the spatial coordinates of the alpha carbon (CA) or sulfur gamma (SG) atoms of residues in a protein chain and computes pairwise distances. Parameters ---------- chain_id : str Chain ID to process. chain : Bio.PDB.Chain The chain object containing residues to process. res_length : int The total number of residues in the chain. residue_idx_edit : dict A dictionary mapping residue numbers to indices for the chain. criteria : str The atom type to use for distance calculations. Valid options are 'CA' (alpha carbon) and 'SG' (sulfur gamma). Returns ------- filter_disto : numpy.ndarray A filtered distogram (distance matrix) that contains distances between Cysteine residues that meet the distance criteria. The shape is (res_length, res_length, 1). pair_list : numpy.ndarray List of residue index pairs where the distance between Cysteine residues satisfies the specified distance criteria. Notes ----- - The function computes the pairwise Euclidean distance between alpha carbons (CA) or sulfur gamma atoms (SG) in the given chain. - For the 'CA' criteria, the distance threshold is set between 3.0 and 7.5 Å, while for 'SG' criteria, it is set between 2.0 and 3.0 Å. - Only Cysteine residues are considered for the distance calculation in the case of 'SG' criteria. """residue_length=int(res_length)calpha_coords=np.zeros([residue_length,3])sgamma_coords=np.zeros([residue_length,3])Cys_list=list()forresidueinchain:ifresidue.id[0]!=' ':continueres_num_b=residue.id[1]res_name=residue.resnameifres_namenotinres_types:continueres_num=residue_idx_edit[chain_id][res_num_b]# For circulationifres_nameinres_types.keys():# for distogramif"CA"inresidue:calpha_coords[res_num]=residue["CA"].get_coord()# for cysteine indexifres_name=='CYS':Cys_list.append(res_num)# for condition of Sifcriteria=='SG':if"SG"inresidue:sgamma_coords[res_num]=residue["SG"].get_coord()real_distogram=squareform(pdist(calpha_coords,'euclidean'))# for disulfide featureifcriteria=='CA':min_dist,max_dist=3.0,7.5distogram=real_distogramelifcriteria=='SG':min_dist,max_dist=2.0,3.0distogram=squareform(pdist(sgamma_coords,'euclidean'))Cys_mask1=np.zeros((distogram.shape))Cys_mask2=np.zeros((distogram.shape))iflen(Cys_list)!=0:Cys_mask1[np.array(Cys_list),:]=1Cys_mask2[:,np.array(Cys_list)]=1Cys_mask=Cys_mask1*Cys_mask2filter_disto=distogram*Cys_maskfilter_disto=np.where(filter_disto<min_dist,0,filter_disto)filter_disto=np.where(filter_disto>max_dist,0,filter_disto)# Reshape the matrixfilter_disto=filter_disto[:,:,None]# for cysteine pair pair_list=list()forindex_i,index_jinzip(np.where(filter_disto!=0)[0],np.where(filter_disto!=0)[1]):pair=[int(index_i),int(index_j)]ifpairnotinpair_listandlist(reversed(pair))notinpair_list:pair_list.append(pair)pair_list=np.array(pair_list)returnfilter_disto,pair_list