Source code for deepfold.data.search.crfalign
import logging
from pathlib import Path
from typing import List
from deepfold.data.search.templates import TemplateHit
logger = logging.getLogger(__name__)
[docs]
def parse_crf(crf_string: str, query_id: str, alignment_dir: Path) -> List[TemplateHit]:
    """Parse a CRF hit list and load the PIR alignment of every listed chain.

    Each non-comment line of ``crf_string`` names one template chain. For
    every such chain the file ``<alignment_dir>/<query_id>-<chain_id>.pir``
    is read and parsed with :func:`parse_pir`.

    Args:
        crf_string: Text with one chain ID per line. Everything from a ``#``
            to the end of the line is a comment; blank lines and lines that
            contain only a comment are skipped.
        query_id: Query identifier used as the prefix of the PIR file names.
        alignment_dir: Directory that contains the PIR alignment files.

    Returns:
        One ``TemplateHit`` per listed chain, in file order. The ``index`` of
        a hit is the 1-based line number of its chain inside the stripped
        ``crf_string``; skipped lines still consume a number.

    Raises:
        OSError: If the PIR file of a listed chain cannot be read.
    """
    crf_hits = []
    for line_no, raw_line in enumerate(crf_string.strip().splitlines(), start=1):
        # Drop a trailing (or whole-line) comment, then surrounding blanks.
        chain_id = raw_line.partition("#")[0].strip()
        if not chain_id:
            # Blank line or comment-only line (possibly indented): there is
            # no chain to load, and "<query_id>-.pir" would not exist anyway.
            continue
        pir_path = alignment_dir / f"{query_id}-{chain_id}.pir"
        crf_hits.append(parse_pir(pir_path.read_text(), index=line_no))
    return crf_hits
[docs]
def parse_pir(pir_string: str, index: int = 0) -> TemplateHit:
    """Build a ``TemplateHit`` from the text of a PIR pairwise alignment.

    The first ``>P1;`` header opens the query entry; every later ``>P1;``
    header opens (or re-opens) the hit entry and sets the hit name. Lines
    starting with ``structureX`` are ignored. Lines starting with ``C;`` are
    not sequence data; if one contains ``probs_sum``, the text after its
    first ``=`` is read as the float ``sum_probs``. Every other line is
    sequence data for the entry currently open, with surrounding whitespace
    and ``*`` characters removed.

    Args:
        pir_string: Full contents of the PIR alignment.
        index: Value stored as the ``index`` of the returned hit.

    Returns:
        A ``TemplateHit`` carrying the gapped query and hit sequences and,
        for each aligned column, the 0-based position of the residue in the
        ungapped sequence, or ``-1`` where that sequence has a gap (``-``).
        ``aligned_cols`` is the length of the gapped hit sequence.

    Raises:
        AssertionError: If no hit name was found.
        ValueError: If the ``probs_sum`` value is not a valid float.
    """
    query_name = None
    hit_name = None
    sum_probs = 0.0
    query_chunks = []
    hit_chunks = []
    # Buffer that receives sequence lines; None until the first header.
    active = None

    for raw in pir_string.strip().splitlines():
        if raw.startswith(">P1;"):
            entry_name = raw.split(";")[1].strip()
            if query_name:
                hit_name = entry_name
                active = hit_chunks
            else:
                query_name = entry_name
                active = query_chunks
            continue
        if raw.startswith("structureX"):
            continue
        if raw.startswith("C;"):
            if "probs_sum" in raw:
                sum_probs = float(raw.split("=")[1].strip())
            continue
        if active is not None:
            active.append(raw.strip().strip("*"))

    query_sequence_str = "".join(query_chunks)
    hit_sequence_str = "".join(hit_chunks)

    def gap_aware_positions(residues):
        # -1 for a gap column, otherwise a running count of residues seen.
        positions = []
        next_pos = 0
        for symbol in residues:
            if symbol == "-":
                positions.append(-1)
            else:
                positions.append(next_pos)
                next_pos += 1
        return positions

    # Only columns present in both sequences are indexed.
    paired_len = min(len(query_sequence_str), len(hit_sequence_str))
    indices_query = gap_aware_positions(query_sequence_str[:paired_len])
    indices_hit = gap_aware_positions(hit_sequence_str[:paired_len])

    assert hit_name
    return TemplateHit(
        index=index,
        name=hit_name,
        aligned_cols=len(hit_sequence_str),
        sum_probs=sum_probs,
        query=query_sequence_str,
        hit_sequence=hit_sequence_str,
        indices_query=indices_query,
        indices_hit=indices_hit,
    )