Source code for deepfold.data.search.msa_identifiers
# Copyright 2024 DeepFold Team


"""Utilities for extracting identifiers from MSA sequence descriptions."""


import re

# Sequence coming from UniProtKB database, e.g.
# "tr|A0A146SKV9|A0A146SKV9_FUNHE/1-100". The species mnemonic is the part of
# the entry name after the last underscore.
_UNIPROT_PATTERN = re.compile(
    r"""
    ^
    (?:tr|sp)
    \|
    (?P<UniqueIdentifier>[A-Za-z0-9_]+)
    \|
    (?P<EntryName>[A-Za-z0-9_]+)
    \/
    .*
    """,
    re.VERBOSE,
)

# PLM
# Descriptions carrying a "taxId=<digits>" field. The leading ".*" is greedy,
# so the last "taxId=" occurrence wins. NOTE: the pattern requires at least
# two digits, so single-digit taxonomy ids are not recognised.
_VDB_PATTERN = re.compile(r"^.*taxId=(?P<TaxId>[1-9][0-9]+).*")


# @dataclasses.dataclass(frozen=True)
# class Identifier:
#     tax_id: str = ""

# Identifiers are plain strings; an empty string means "no identifier found".
Identifier = str


def _parse_sequence_identifier(msa_sequence_identifier: str) -> Identifier:
    """Gets species from an MSA sequence identifier.

    Args:
        msa_sequence_identifier: Description of a sequence in an MSA.
            Surrounding whitespace is ignored.

    Returns:
        ``"uniprot:<species>"`` when the description is a UniProtKB identifier
        (``<species>`` being the entry-name suffix after the last underscore),
        ``"vdb:<tax id>"`` when it carries a ``taxId=`` field, and an empty
        ``Identifier`` when neither pattern matches.
    """
    description = msa_sequence_identifier.strip()

    # UniProtKB identifiers take precedence over the taxId-based form.
    uniprot_match = _UNIPROT_PATTERN.search(description)
    if uniprot_match:
        return "uniprot:" + uniprot_match.group("EntryName").split("_")[-1]

    vdb_match = _VDB_PATTERN.search(description)
    if vdb_match:
        # Return the complete taxonomy id. Indexing the captured group with
        # [-1] kept only its final digit, which made unrelated taxa collide.
        return "vdb:" + vdb_match.group("TaxId")

    return Identifier()
def get_identifiers(description: str) -> Identifier:
    """Extract the species identifier from an MSA sequence description.

    Args:
        description: Description of a sequence in an MSA.

    Returns:
        The identifier produced by ``_parse_sequence_identifier``: a
        ``uniprot:``- or ``vdb:``-prefixed string, or an empty string when the
        description matches neither known pattern.
    """
    return _parse_sequence_identifier(description)