Source code for deepfold.utils.feats_utils
import numpy as np
from deepfold.data.search.parsers import parse_fasta
[docs]
def crop_features(feats: dict, start: int, end: int):
"""
Args:
[start, end] -> range(start - 1, end)
"""
mask = np.arange(start - 1, end, dtype="int")
new_feats = {}
new_feats["domain_name"] = feats["domain_name"].copy()
new_feats["template_domain_names"] = feats["template_domain_names"].copy()
new_feats["template_sum_probs"] = feats["template_sum_probs"].copy()
new_feats["aatype"] = feats["aatype"][mask, :]
new_feats["between_segment_residues"] = feats["between_segment_residues"][mask]
new_feats["residue_index"] = feats["residue_index"][mask]
new_msa = feats["msa"][:, mask]
msa_mask = ~np.all(new_msa == 21, axis=1) # All-gap
new_feats["msa"] = new_msa[msa_mask]
new_feats["deletion_matrix_int"] = feats["deletion_matrix_int"][new_msa, mask]
new_feats["num_alignments"] = feats["num_alignments"][mask]
new_feats["num_alignments"].fill(len(msa_mask))
new_feats["template_aatype"] = feats["template_aatype"][:, mask, :]
new_feats["template_all_atom_positions"] = feats["template_all_atom_positions"][:, mask, :, :]
new_feats["template_all_atom_mask"] = feats["template_all_atom_mask"][:, mask, :]
new_feats["seq_length"] = feats["seq_length"][mask]
new_feats["seq_length"].fill(end - start + 1)
new_feats["sequence"] = np.array([feats["sequence"].item()[start - 1 : end]], dtype=np.object_)
if isinstance(feats["template_sequence"], np.ndarray):
new_feats["template_sequence"] = [s[start - 1 : end] for s in feats["template_sequence"]]
else:
new_feats["template_sequence"] = [s[start - 1 : end] for s in feats["template_sequence"]]
return new_feats
[docs]
def crop_a3m_string(a3m_string: str, start: int, end: int) -> str:
assert start > 0
assert end >= start
sequences, descriptions = parse_fasta(a3m_string)
subsequences = []
for seq in sequences:
actual_start = -1
actual_end = -1
count = 0
for idx, char in enumerate(seq):
if count == start - 1:
actual_start = idx
if count == end - 1:
actual_end = idx
break
if char.isupper() or char == "-":
count += 1
assert actual_start != -1
assert actual_end != -1
subsequences.append(seq[actual_start : actual_end + 1])
lines = []
for seq, desc in zip(subsequences, descriptions):
lines.append(f">{desc}")
lines.append(seq)
return "\n".join(lines)