# Copyright 2021 DeepMind Technologies Limited# Copyright 2022 AlQuraishi Laboratory# Copyright 2023 NVIDIA CORPORATION# Copyright 2024 DeepFold Team## Licensed under the Apache License, Version 2.0 (the "License");# you may not use this file except in compliance with the License.# You may obtain a copy of the License at## http://www.apache.org/licenses/LICENSE-2.0## Unless required by applicable law or agreed to in writing, software# distributed under the License is distributed on an "AS IS" BASIS,# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.# See the License for the specific language governing permissions and# limitations under the License."""A Python wrapper for Kalign."""importloggingimportshutilimportsubprocessimporttempfilefrompathlibimportPathfromtypingimportListlogger=logging.getLogger(__name__)
[docs]classKalign:"""Python wrapper of the Kalign binary."""def__init__(self,binary_path:str,verbose:bool=False)->None:"""Initializes the Python Kalign wrapper. Args: binary_path: The path to the Kalign binary. verbose: Whether to print relevant information. """self.binary_path=binary_pathself.verbose=verbose
[docs]defalign(self,sequences:List[str])->str:"""Aligns the sequences and returns the alignment in A3M string. Args: sequences: A list of query sequence strings. The sequences have to be at least 6 residues long (Kalign requires this). Note that the order in which you give the sequences might alter the output slightly as different alignment tree might get constructed. Returns: A string with the alignment in a3m format. Raises: RuntimeError: If Kalign fails. ValueError: If any of the sequences is less than 6 residues long. """ifself.verbose:logger.info(f"Kalign input: {sequences}")assertlen(sequences)>=2forsinsequences:iflen(s)<6:raiseValueError("Kalign requires all sequences to be at least 6 residues long."f" Sequence={repr(s)} has {len(s)} residues.")tmp_dirpath=Path(tempfile.mkdtemp(dir=Path("/tmp")))input_fasta_path=tmp_dirpath/"input.fasta"output_a3m_path=tmp_dirpath/"output.a3m"withopen(input_fasta_path,"w")asf:f.write(_to_a3m(sequences))kalign_cmd=[self.binary_path,"-i",str(input_fasta_path),"-o",str(output_a3m_path),"-format","fasta",]ifself.verbose:logger.info(f"Kalign command: {kalign_cmd}")process=subprocess.Popen(args=kalign_cmd,stdout=subprocess.PIPE,stderr=subprocess.PIPE,)try:retcode=process.wait(timeout=1.0)exceptsubprocess.TimeoutExpired:print("Kalign timeout expired!")raiseRuntimeError("Kalign timeout expired!\n")stdout,stderr=process.communicate()ifself.verbose:logger.info(f"Kalign stdout:\n{stdout.decode('utf-8')}\n"f"Kalign stderr:\n{stderr.decode('utf-8')}\n")ifretcode!=0:shutil.rmtree(tmp_dirpath,ignore_errors=True)raiseRuntimeError("Kalign error!\n"f"Kalign stdout:\n{stdout.decode('utf-8')}\n"f"Kalign stderr:\n{stderr.decode('utf-8')}\n")withopen(output_a3m_path)asf:a3m=f.read()ifself.verbose:logger.info(f"Kalign output: {a3m}")shutil.rmtree(tmp_dirpath,ignore_errors=True)returna3m
def_to_a3m(sequences:List[str])->str:"""Converts sequences to an a3m file."""names=[f"sequence {i}"foriinrange(1,len(sequences)+1)]a3m=[]forsequence,nameinzip(sequences,names):a3m.append(">"+name+"\n")a3m.append(sequence+"\n")return"".join(a3m)