Source code for bis_protein_structure.DATAGEN

import concurrent.futures
import os
import re
from multiprocessing import Pool

from Bio import SeqIO
from Bio.PDB import MMCIFParser, MMCIFIO
from Bio.PDB import PDBParser, PDBIO, Select
from Bio.PDB import Dice
from Bio.PDB.DSSP import DSSP

def parse_mmcif(path, file_id, chain_id, alignment_dir):
    """
    Read an MMCIF file from disk, parse it, and turn it into processed data.

    Parameters
    ----------
    path : str
        Location of the MMCIF file to read.
    file_id : str
        Identifier handed to ``mmcif_parsing.parse`` for this file.
    chain_id : str
        Chain to extract, forwarded to ``data_pipeline.process_mmcif``.
    alignment_dir : str
        Alignment directory, forwarded to ``data_pipeline.process_mmcif``.

    Returns
    -------
    data : dict
        Whatever ``data_pipeline.process_mmcif`` returns for the parsed file.

    Raises
    ------
    Exception
        When the parse result carries no MMCIF object, the first value of the
        result's ``errors`` mapping is raised.
    """
    with open(path, 'r') as handle:
        raw_mmcif = handle.read()

    parse_result = mmcif_parsing.parse(file_id=file_id, mmcif_string=raw_mmcif)

    if parse_result.mmcif_object is None:
        # The parser reports failures through ``errors``; surface the first.
        recorded_errors = list(parse_result.errors.values())
        raise recorded_errors[0]

    parsed_mmcif = parse_result.mmcif_object
    return data_pipeline.process_mmcif(
        mmcif=parsed_mmcif,
        alignment_dir=alignment_dir,
        chain_id=chain_id,
    )
def generate_feature_dict(tag, seq, fasta_path, alignment_dir):
    """
    Build the feature dictionary for one sequence from its FASTA file.

    Parameters
    ----------
    tag : str
        Sequence identifier; also the name of the per-sequence sub-directory
        inside ``alignment_dir``.
    seq : str
        The amino acid sequence. Not used by this function; kept in the
        signature for callers.
    fasta_path : str
        Path to the FASTA file, forwarded to ``data_processor.process_fasta``.
    alignment_dir : str
        Root directory that holds one alignment sub-directory per tag.

    Returns
    -------
    feature_dict : dict
        The result of ``data_processor.process_fasta``.
    """
    # Alignments for this sequence live under <alignment_dir>/<tag>.
    tag_alignment_dir = os.path.join(alignment_dir, tag)
    return data_processor.process_fasta(
        fasta_path=fasta_path,
        alignment_dir=tag_alignment_dir,
    )
def parse_fasta(data):
    """
    Split FASTA text into parallel lists of tags and sequences.

    Parameters
    ----------
    data : str
        Full text content of a FASTA file.

    Returns
    -------
    tags : list of str
        First whitespace-delimited token of each header line.
    seqs : list of str
        Sequence text for each header, with newlines removed.
    """
    # Drop any '>' that sits at the very end of a line (an empty header).
    cleaned = re.sub('>$', '', data, flags=re.M)

    pieces = []
    for record in cleaned.split('>'):
        # Each record yields its header line and, when present, the rest.
        for part in record.strip().split('\n', 1):
            pieces.append(part.replace('\n', ''))

    # The first piece is whatever preceded the first '>' and is discarded.
    pieces = pieces[1:]

    headers = pieces[::2]
    seqs = pieces[1::2]
    tags = [header.split()[0] for header in headers]
    return tags, seqs
def _print_fasta_record(sequence_id, chunks):
    """Print one record as labelled ``Sequence ID`` / ``Sequence`` lines."""
    print(f'Sequence ID: {sequence_id}')
    print(f"Sequence: {''.join(chunks)}")


def read_fasta(file_path):
    """
    Reads a FASTA file and prints out each sequence ID and sequence.

    Every record, including the last one, is printed in the same form::

        Sequence ID: <header text after '>'>
        Sequence: <sequence lines joined together>

    with one empty line between consecutive records. Lines that appear before
    the first header are ignored.

    Parameters
    ----------
    file_path : str
        Path to the FASTA file.

    Returns
    -------
    None
    """
    sequence_id = None
    chunks = []  # sequence lines of the current record, joined when printed

    with open(file_path, 'r') as fasta_file:
        for raw_line in fasta_file:
            line = raw_line.strip()
            if line.startswith('>'):
                if sequence_id is not None:
                    _print_fasta_record(sequence_id, chunks)
                    print()  # empty line between sequences
                sequence_id = line[1:]
                chunks = []
            else:
                chunks.append(line)

    # The last record has no following header to trigger its output.
    if sequence_id is not None:
        _print_fasta_record(sequence_id, chunks)
class AllResiduesSelector(Select):
    """
    ``Select`` subclass that keeps only the residues of one chain.

    Parameters
    ----------
    target_chain_id : str
        ID of the chain whose residues should be accepted.

    Methods
    -------
    accept_residue(residue)
        Returns True if the residue's parent has the target chain ID.
    """

    def __init__(self, target_chain_id):
        self.target_chain_id = target_chain_id

    def accept_residue(self, residue):
        """Return True when ``residue`` belongs to the target chain."""
        parent_chain = residue.get_parent()
        return parent_chain.id == self.target_chain_id
def mmcif_to_pdbs(input_mmcif_file, output_pdb_root):
    """
    Converts an MMCIF file to separate PDB files for each chain.

    One file named ``<cif_name>_<chain_id>.pdb`` is written per distinct chain
    ID, where ``cif_name`` is the input file name up to the first ``.cif``.
    Residues are selected by chain ID only (see ``AllResiduesSelector``).

    Parameters
    ----------
    input_mmcif_file : str
        Path to the input MMCIF file.
    output_pdb_root : str
        Directory where the output PDB files will be stored.

    Returns
    -------
    None
    """
    # basename() handles the platform's path separators, not only '/'.
    cif_name = os.path.basename(input_mmcif_file).split('.cif')[0]

    mmcif_parser = MMCIFParser(QUIET=True)
    structure = mmcif_parser.get_structure("structure", input_mmcif_file)

    pdb_io = PDBIO()
    # The structure is the same for every chain, so attach it once.
    pdb_io.set_structure(structure)

    # The same chain ID can occur in several models. It maps to the same
    # output file and the same selection, so write each chain ID once.
    written_chain_ids = set()
    for model in structure:
        for chain in model:
            if chain.id in written_chain_ids:
                continue
            written_chain_ids.add(chain.id)
            output_pdb_file = os.path.join(
                output_pdb_root, f'{cif_name}_{chain.id}.pdb')
            pdb_io.save(output_pdb_file, AllResiduesSelector(chain.id))
def process_mmcif_to_pdbs(mmcif, mmcif_root, output_root):
    """
    Convert one MMCIF file to per-chain PDB files, reporting any failure.

    Any exception raised during conversion is caught and printed, so the
    function never raises an ``Exception`` subclass to its caller.

    Parameters
    ----------
    mmcif : str
        File name of the MMCIF file, relative to ``mmcif_root``.
    mmcif_root : str
        Directory that contains the MMCIF file.
    output_root : str
        Directory that receives the PDB files.

    Returns
    -------
    None
    """
    try:
        source_path = os.path.join(mmcif_root, mmcif)
        mmcif_to_pdbs(source_path, output_root)
    except Exception as err:
        # Best effort: report the failing file and carry on.
        print(f"Error processing {mmcif}: {err}")
def parallel_processing(mmcifs, mmcif_root, output_root):
    """
    Convert many MMCIF files to PDB files using a pool of worker processes.

    Parameters
    ----------
    mmcifs : list of str
        File names of the MMCIF files to convert.
    mmcif_root : str
        Directory that contains the MMCIF files.
    output_root : str
        Directory that receives the PDB files.

    Returns
    -------
    None
    """
    with Pool() as workers:
        # One argument tuple per file for process_mmcif_to_pdbs.
        jobs = []
        for file_name in mmcifs:
            jobs.append((file_name, mmcif_root, output_root))
        workers.starmap(process_mmcif_to_pdbs, jobs)
def create_structure_from_feature(sequence, all_atom_positions, all_atom_mask,
                                  structure_id="pred", model_id=0,
                                  chain_id="A"):
    """
    Creates a structure from sequence and atomic position information.

    A single-model, single-chain structure is built with one residue per
    sequence position (numbered from 1). An atom is added for every slot
    ``j`` where ``all_atom_mask[i, j] == 1``. Residue-level solvent
    accessibility is then computed with ``ShrakeRupley`` and left on each
    residue's ``sasa`` attribute.

    Parameters
    ----------
    sequence : str
        Amino acid sequence of the protein.
    all_atom_positions : numpy.ndarray
        Array of atomic positions, indexed as ``[residue, atom_slot]``.
    all_atom_mask : numpy.ndarray
        Mask indexed as ``[residue, atom_slot]``; 1 marks a valid atom.
    structure_id : str, optional
        Identifier for the structure (default is 'pred').
    model_id : int, optional
        Model ID for the structure (default is 0).
    chain_id : str, optional
        Chain ID for the structure (default is 'A').

    Returns
    -------
    structure : Bio.PDB.Structure.Structure
        Generated structure object containing atomic coordinates, with
        ``sasa`` set on each residue by ``ShrakeRupley.compute``.
    """
    structure = Structure.Structure(structure_id)
    model = Model.Model(model_id)
    chain = Chain.Chain(chain_id)

    num_atom_slots = all_atom_positions.shape[1]
    for i, residue_name in enumerate(sequence):
        residue_id = (' ', i + 1, ' ')
        residue = Residue.Residue(residue_id, residue_name, '')
        for j in range(num_atom_slots):
            if all_atom_mask[i, j] == 1:
                # Slots without a known name get a placeholder "X<n>".
                atom_name = index_to_atom_name.get(j, f"X{j + 1}")
                atom_coords = all_atom_positions[i, j]
                # NOTE(review): the element is hard-coded to 'C' for every
                # atom - confirm this is intended, since it may affect the
                # radii ShrakeRupley uses.
                atom = Atom.Atom(atom_name, atom_coords, 1.0, 1.0, '',
                                 atom_name, j + 1, 'C')
                residue.add(atom)
        chain.add(residue)
    model.add(chain)
    structure.add(model)

    # compute(level="R") stores its result on each residue's ``sasa``.
    # The earlier post-processing loop replaced that value with None whenever
    # ``xtra`` lacked 'EXP_NACCESS', which is always the case for the freshly
    # built residues here, so it discarded every computed value.
    sr = ShrakeRupley()
    sr.compute(structure, level="R")
    return structure