Edit on GitHub

structure_tools

  • This module provides utility functions and classes to work with structure files (PDB or CIF).
  • Uses Biopython for structure manipulation and parsing.
  • The structure object refers to Bio.PDB.Structure.Structure.
  • The residue object refers to Bio.PDB.Residue.Residue.
    Hence, the "residue" term is used for amino acids, nucleotides, ions, ligands.
  1"""
  2[structure_tools](https://github.com/isblab/af_pipeline/tree/main/af_pipeline/tools/structure_tools.py)
  3==================================
  4- This module provides utility functions and classes to work with structure files (PDB or CIF).
  5- Uses `Biopython` for structure manipulation and parsing.
  6- The structure object refers to `Bio.PDB.Structure.Structure`.
  7- The residue object refers to `Bio.PDB.Residue.Residue`.<br />
  8  Hence, the "residue" term is used for amino acids, nucleotides, ions, ligands.
  9"""
 10from textwrap import dedent
 11import warnings
 12import Bio
 13import Bio.PDB
 14import Bio.PDB.Structure
 15import Bio.PDB.Residue
 16import Bio.PDB.Atom
 17import numpy as np
 18from Bio.PDB.Chain import Chain
 19from Bio.PDB.mmcifio import MMCIFIO
 20from Bio.PDB.PDBIO import Select, PDBIO
 21from collections import defaultdict
 22from typing import Dict, Any, overload
 23from af_pipeline.constants.af_constants import (
 24    PROTEIN_ENTITIES,
 25    DNA_ENTITIES,
 26    RNA_ENTITIES,
 27    ION,
 28    PURINES,
 29    PYRIMIDINES,
 30    ALLOWED_PTMS,
 31    ALLOWED_DNA_MODS,
 32    ALLOWED_RNA_MODS,
 33    ALLOWED_LIGANDS,
 34    ONLY_CA_RESIDUES,
 35    AtomDecoration,
 36    EntityType,
 37    FileFormat,
 38    InteractionMapType,
 39    ResidueDecoration,
 40    ResidueMapDepth,
 41    VALID_INTERACTION_MAP_TYPES,
 42    ResidueMapKeys,
 43    TokenLevel,
 44)
 45
 46def get_interaction_map(
 47    coords1: np.ndarray,
 48    coords2: np.ndarray,
 49    contact_threshold: float,
 50    map_type: InteractionMapType,
 51) -> np.ndarray:
 52    """ Create an interaction map, given the input coordinates.
 53
 54    Returns a distance map or a contact map, based on the map_type specified.
 55
 56    ## Arguments:
 57
 58    - **coords1 (np.ndarray)**:<br />
 59        Coordinates of shape (N, 3) for the first set of atoms/residues.
 60
 61    - **coords2 (np.ndarray)**:<br />
 62        Coordinates of shape (M, 3) for the second set of atoms/residues.
 63
 64    - **contact_threshold (float)**:<br />
 65        Distance threshold to define a contact.
 66        Only used if `map_type` is `InteractionMapType.CONTACT`.
 67
 68    - **map_type (InteractionMapType)**:<br />
 69        Type of interaction map to create.
 70        Can be either `InteractionMapType.DISTANCE` or `InteractionMapType.CONTACT`.
 71
 72         - `InteractionMapType.DISTANCE`:
 73            Returns a distance map of shape (N, M)
 74            where each element (i, j) is the distance between the i-th atom/residue
 75            in `coords1` and the j-th atom/residue in `coords2`.
 76
 77         - `InteractionMapType.CONTACT`:
 78            Returns a binary contact map of shape (N, M)
 79            where each element (i, j) is 1 if the distance between the i-th atom/residue
 80            in `coords1` and the j-th atom/residue in `coords2` is less than or equal to
 81            `contact_threshold`, and 0 otherwise.
 82
 83    ## Returns:
 84
 85    - **np.ndarray**:<br />
 86        The interaction map as a numpy array.
 87    """
 88
 89    if map_type not in VALID_INTERACTION_MAP_TYPES:
 90        raise ValueError(
 91            f"Invalid map_type specified."\
 92            f"Expected one of {VALID_INTERACTION_MAP_TYPES}, got {map_type}"
 93        )
 94
 95    distance_map = get_distance_map(coords1, coords2)
 96
 97    if map_type == InteractionMapType.DISTANCE:
 98        return distance_map
 99
100    elif map_type == InteractionMapType.CONTACT:
101        contact_map = get_contact_map(distance_map, contact_threshold)
102        return contact_map
103
104def get_distance_map(coords1: np.ndarray, coords2: np.ndarray):
105    """ Create an all-v-all distance map.
106
107    Returns a matrix of distances between all pairs of atoms/residues in the
108    two sets of coordinates.
109
110    ## Arguments:
111
112    - **coords1 (np.ndarray)**:<br />
113        Coordinates of shape (N, 3) for the first set of atoms/residues.
114
115    - **coords2 (np.ndarray)**:<br />
116        Coordinates of shape (M, 3) for the second set of atoms/residues.
117
118    ## Returns:
119
120    - **np.ndarray**:<br />
121        A matrix of shape (N, M) where each element (i, j) is the Euclidean distance
122        between the i-th atom/residue in `coords1` and the j-th atom/residue in `coords2`.
123    """
124
125    from scipy.spatial import distance_matrix
126
127    distance_map = distance_matrix(coords1, coords2)
128
129    return distance_map
130
131def get_contact_map(distance_map: np.ndarray, contact_threshold: float):
132    """ Given the distance map, create a binary contact map by thresholding distances.
133
134    Returns a binary matrix, where 1 indicates a contact and 0 indicates no contact.
135
136    ## Arguments:
137
138    - **distance_map (np.ndarray)**:<br />
139        A matrix of shape (N, M) where each element (i, j) is the Euclidean distance
140        between the i-th atom/residue in `coords1` and the j-th atom/residue in `coords2`.
141
142    - **contact_threshold (float)**:<br />
143        Distance threshold to define a contact. If the distance between two atoms/residues
144        is less than or equal to this threshold, they are considered to be in contact.
145
146    ## Returns:
147
148    - **np.ndarray**:<br />
149        A binary matrix of shape (N, M) where each element (i, j) is 1 if the distance
150        between the i-th atom/residue in `coords1` and the j-th atom/residue in `coords2`
151        is less than or equal to `contact_threshold`, and 0 otherwise.
152    """
153
154    contact_map = np.where(distance_map <= contact_threshold, 1, 0)
155
156    return contact_map
157
158def get_token_level(residue: Bio.PDB.Residue.Residue) -> TokenLevel:
159    """ Get the token level of the residue.
160
161    Assuming the `residue` object is decorated with the appropriate attributes.
162    - `tokenLevel`: `TokenLevel.ATOM` if the residue has per-atom token, `TokenLevel.RESIDUE` otherwise.
163
164    Arguments:
165
166    - **residue (Bio.PDB.Residue.Residue)**:<br />
167        Biopython residue object.
168
169    Returns:
170
171    - **token_level (TokenLevel)**:<br />
172        Token level of the residue.
173    """
174
175    token_level = residue.xtra.get(
176        ResidueDecoration.TOKEN_LEVEL, TokenLevel.RESIDUE
177    )
178
179    return token_level
180
181def has_modifications(structure: Bio.PDB.Structure.Structure) -> bool:
182    """ Check if the structure has any modified residues.
183
184    Assuming the `residue` objects are decorated with the appropriate attributes.
185    - `is_modified`: `True` if the `residue` is modified.
186
187    Arguments:
188
189    - **structure (Bio.PDB.Structure.Structure)**:<br />
190        Biopython structure object.
191
192    Returns:
193
194    - **condition (bool)**:<br />
195        `True` if the structure has any modified residues, `False` otherwise.
196    """
197
198    for residue in structure.get_residues():
199        if residue.xtra.get(ResidueDecoration.IS_MODIFIED, False):
200            return True
201
202    return False
203
204def save_structure_obj(
205    structure: Bio.PDB.Structure.Structure,
206    out_file: str,
207    res_select_obj: Select = Select(),
208    save_type: str = FileFormat.CIF,
209    preserve_header_footer = False,
210):
211    """Save the selection in Biopython structure object as a PDB or CIF file.
212
213    Arguments:
214
215    - **structure (Bio.PDB.Structure.Structure)**:<br />
216        Biopython structure object to save.
217
218    - **out_file (str)**:<br />
219        Path to the output file where the structure will be saved.
220
221    - **res_select_obj (Bio.PDB.Select, optional)**:<br />
222        Biopython Select object to filter the residues to save.
223        Defaults to `Select()` which saves all residues.
224
225    - **save_type (str, optional)**:<br />
226        Type of file to save the structure as.
227        Can be either "pdb" or "cif".
228
229    - **preserve_header_footer (bool, optional)**:<br />
230        If `True`, the header and footer information from the structure
231        will be preserved in the saved file.
232        > [!NOTE]
233        > The header and footer information can only be preserved for CIF files.
234    """
235
236    if save_type == FileFormat.PDB:
237
238        io = PDBIO()
239        io.set_structure(structure)
240        io.save(out_file, res_select_obj)
241
242        if preserve_header_footer:
243            warnings.warn(
244                """
245
246                PDB files do not support headers and footers.
247                Saving without headers and footers.
248                """
249            )
250
251    elif save_type == FileFormat.CIF:
252
253        io = MMCIFIO()
254        io.set_structure(structure)
255        io.save(out_file, res_select_obj)
256
257        if preserve_header_footer:
258
259            header_footer = structure.xtra.get("header_footer", {})
260
261            if {"header", "footer"}.issubset(set(header_footer.keys())):
262
263                with open(out_file, "r") as f:
264                    lines = f.readlines()
265
266                lines.insert(0, "\n")
267                lines.append("\n")
268
269                for header_line in header_footer["header"][::-1]:
270                    lines.insert(0, f"{header_line}")
271
272                for footer_line in header_footer["footer"][::-1]:
273                    lines.append(f"{footer_line}")
274
275                with open(out_file, "w") as f:
276                    for line in lines:
277                        f.write(line)
278
279            else:
280                warnings.warn(
281                    """
282
283                    No header or footer information found in the structure.
284                    Saving without headers and footers.
285                    """
286                )
287
288def add_header_footer(
289    structure: Bio.PDB.Structure.Structure,
290    structure_file_path: str,
291) -> Bio.PDB.Structure.Structure:
292    """Add the header and footer information to the structure object.
293
294    While parsing the `CIF` file using Biopython `MMCIFParser`,
295    the header and footer information information is not retained.<br />
296    This function extracts the header and footer information from the
297    structure file and adds it to the structure object. \n
298
299    The information is stored in the `xtra` attribute of the structure
300    object under the key `header_footer`. \n
301
302    > [!NOTE]
303    > - `PDB` format does not have header and footer information.
304    > - Using this function leads to parsing of the structure file twice. Once
305    >   to get the structure object and once to extract the header and footer information.
306
307    Arguments:
308
309    - **structure (Bio.PDB.Structure.Structure)**:<br />
310        Biopython structure object.
311
312    - **structure_file_path (str)**:<br />
313        Path to the structure file.
314
315    Returns:
316
317    - **structure (Bio.PDB.Structure.Structure)**:<br />
318        Biopython Structure object with `header_footer` added in `xtra`.
319    """
320
321    with open(structure_file_path, "r") as f:
322        lines = f.readlines()
323
324    header_info = []
325    header_section = ""
326
327    for line in lines:
328        header_section += line
329
330        if line.startswith("#"):
331            header_info.append(header_section)
332            header_section = ""
333
334        if line.startswith("_atom_site"):
335            break
336
337    footer_info = []
338    footer_section = ""
339
340    for line in lines[::-1]:
341        footer_section = line + footer_section
342
343        if line.startswith("#"):
344            footer_info.append(footer_section)
345            footer_section = ""
346
347        if line.startswith("ATOM") or line.startswith("HETATM"):
348            break
349
350    structure.xtra["header_footer"] = {
351        "header": header_info,
352        "footer": footer_info,
353    }
354
355    return structure
356
357def decorate_residue(
358    residue: Bio.PDB.Residue.Residue,
359    xtra_field: str | None = None,
360    xtra_value: Any = None,
361):
362    """Decorate the residue.
363
364    This function adds `entityType` attribute to the residue's `xtra`
365    attribute depending on the type of entity.
366
367    Allowed entity types are:
368    ```python
369    - "proteinChain"
370    - "dnaSequence"
371    - "rnaSequence"
372    - "ligand"
373    - "ion"
374    - None #(if the residue does not belong to any of the above types)
375    ```
376
377    It also adds the following flags to the residue's `xtra` wherever applicable -
378    ```python
379    - "is_modified" # boolean indicating if the residue is modified
380    - "is_ca_only" # boolean indicating if the residue has only CA atom
381    - "is_purine" # boolean indicating if the nucleotide is purine
382    - "is_pyrimidine" # boolean indicating if the nucleotide is pyrimidine
383    - "is_ion" # boolean indicating if the residue is an ion.
384    - "is_ligand" # boolean indicating if the residue is a ligand.
385    ```
386
387    Arguments:
388
389    - **residue (Bio.PDB.Residue.Residue)**:<br />
390        Biopython residue object.
391    """
392
393    symbol = residue.get_resname()
394
395    if symbol in PROTEIN_ENTITIES:
396        residue.xtra[ResidueDecoration.ENTITY_TYPE] = EntityType.PROTEIN_CHAIN
397
398        if symbol in ALLOWED_PTMS:
399            residue.xtra[ResidueDecoration.IS_MODIFIED] = True
400
401        if symbol in ONLY_CA_RESIDUES:
402            residue.xtra[ResidueDecoration.IS_CA_ONLY] = True
403
404    elif symbol in DNA_ENTITIES:
405        residue.xtra[ResidueDecoration.ENTITY_TYPE] = EntityType.DNA_SEQUENCE
406
407        if symbol in ALLOWED_DNA_MODS:
408            residue.xtra[ResidueDecoration.IS_MODIFIED] = True
409    elif symbol in RNA_ENTITIES:
410        residue.xtra[ResidueDecoration.ENTITY_TYPE] = EntityType.RNA_SEQUENCE
411
412        if symbol in ALLOWED_RNA_MODS:
413            residue.xtra[ResidueDecoration.IS_MODIFIED] = True
414
415    elif symbol in ALLOWED_LIGANDS:
416        residue.xtra[ResidueDecoration.ENTITY_TYPE] = EntityType.LIGAND
417
418    elif symbol in ION:
419        residue.xtra[ResidueDecoration.ENTITY_TYPE] = EntityType.ION
420
421    else:
422        warnings.warn(dedent(f"""
423            The residue "{symbol}" does not belong to any known entity types.
424            Setting 'entityType' to None.""")
425        )
426        residue.xtra[ResidueDecoration.ENTITY_TYPE] = None
427
428    if symbol in PURINES:
429        residue.xtra[ResidueDecoration.IS_PURINE] = True
430
431    elif symbol in PYRIMIDINES:
432        residue.xtra[ResidueDecoration.IS_PYRIMIDINE] = True
433
434    if xtra_field is not None and xtra_value is not None:
435        if xtra_field in residue.xtra:
436            warnings.warn(dedent(f"""
437                The field '{xtra_field}' already exists in the residue's xtra.
438                Overwriting the value.""")
439            )
440        residue.xtra[xtra_field] = xtra_value
441
442    condition = (
443        residue.xtra.get(ResidueDecoration.IS_MODIFIED)
444        or residue.xtra.get(ResidueDecoration.ENTITY_TYPE) in [EntityType.LIGAND, None]
445    )
446    _get_token_level = {True: TokenLevel.ATOM, False: TokenLevel.RESIDUE}
447    residue.xtra[ResidueDecoration.TOKEN_LEVEL] = _get_token_level[condition]
448
449def decorate_atom(
450    atom: Bio.PDB.Atom.Atom,
451    xtra_field: str | None = None,
452    xtra_value: Any = None,
453):
454    """ Decorate the atom.
455
456    This function adds `is_representative` attribute to the atom's `xtra`
457
458    Arguments:
459
460    - **atom (Bio.PDB.Atom.Atom)**:<br />
461        Biopython atom object.
462
463    - **xtra_field (str | None, optional)**:<br />
464        field to add to the atom's `xtra`.
465
466    - **xtra_value (Any, optional)**:<br />
467        value of the field to add to the atom's `xtra`.
468    """
469
470    symbol = atom.get_name()
471    residue = atom.get_parent()
472
473    if not isinstance(residue, Bio.PDB.Residue.Residue):
474        raise TypeError(
475            f"Expected a `Bio.PDB.Residue.Residue` object, got {type(residue)}"
476        )
477
478    residue = residue.get_resname()
479
480    residue_attrs = (
481        residue in PROTEIN_ENTITIES,
482        residue in ONLY_CA_RESIDUES,
483        residue in PURINES,
484        residue in PYRIMIDINES,
485        symbol
486    )
487
488    atom_xtra_dict = {
489        (True, False, False, False, "CB"): True,
490        (True, True, False, False, "CA"): True,
491        (False, False, True, False, "C4"): True,
492        (False, False, False, True, "C2"): True,
493    }
494
495    atom.xtra[AtomDecoration.IS_REPRESENTATIVE] = atom_xtra_dict.get(
496        residue_attrs, False
497    )
498
499    if (
500        xtra_field is not None
501        and xtra_value is not None
502        and xtra_field in atom.xtra
503    ):
504        warnings.warn(dedent(f"""
505            The field '{xtra_field}' already exists in the atom's xtra.
506            Overwriting the value.""")
507        )
508        atom.xtra[xtra_field] = xtra_value
509
510
511class RenumberResidues:
512    """Class to renumber the residues based on the offset.
513
514    If the input sequence to the AlphFold does not start from the first residue,
515    the residue numbering in the predicted structure will be different from
516    the original (UniProt in case of proteins) numbering.
517
518    This class renumbers the residues in the structure based on the provided `offset`.
519
520    > [!NOTE]
521    > Although written with the purpose for AlphFold predicted structures,
522    > this class can be used to renumber residues in any structure file.
523    """
524
525    offset: dict
526    """ Offset describing start and end residue number for each chain in
527    the predicted structure.\n
528    example: `{'A': [1, 100], 'B': [101, 200]}`."""
529
530    def __init__(self, offset: dict = {}):
531
532        self.offset = offset
533
534    def renumber_structure(
535        self,
536        structure: Bio.PDB.Structure.Structure,
537    ):
538        """Renumber the residues in the structure based on the offset.
539
540        Arguments:
541
542        - **structure (Bio.PDB.Structure.Structure)**:<br />
543            Biopython structure object.
544
545        Returns:
546
547        - **structure (Bio.PDB.Structure.Structure)**:<br />
548            Biopython structure object with renumbered residues.
549        """
550
551        for model in structure:
552            for residue in model.get_residues():
553                chain_id = residue.parent.id
554                h, num, ins = residue.id
555
556                num = self.renumber_chain_res_num(
557                    chain_res_num=num,
558                    chain_id=chain_id,
559                )
560
561                residue.id = (h, num, ins)
562
563        return structure
564
565    def original_chain_res_num(
566        self,
567        chain_res_num: int,
568        chain_id: str,
569    ):
570        """Get the original residue number based on the offset.
571
572        Inverse of `renumber_chain_res_num`.
573
574        Arguments:
575
576        - **chain_res_num (int)**:<br />
577            Residue index (1-indexed) within the chain in the predicted structure.
578
579        - **chain_id (str)**:<br />
580            Chain ID of the residue.
581
582        Returns:
583
584        - **chain_res_num (int)**:<br />
585            Original residue number
586        """
587
588        if chain_id in self.offset:
589            chain_res_num -= (self.offset[chain_id][0] - 1)
590
591        return chain_res_num
592
593    def renumber_chain_res_num(
594        self,
595        chain_res_num: int,
596        chain_id: str,
597    ):
598        """Renumber the residue number based on the offset.
599
600        Given a residue index (1-indexed) in a chain, this function renumbers it
601        based on the offset provided for that chain.
602
603        Arguments:
604
605        - **chain_res_num (int)**:<br />
606            Residue index (1-indexed) within the chain in the predicted structure.
607
608        - **chain_id (str)**:<br />
609            Chain ID of the residue.
610
611        Returns:
612
613        - **chain_res_num (int)**:<br />
614            Renumbered residue number
615        """
616
617        if chain_id in self.offset:
618            chain_res_num += self.offset[chain_id][0] - 1
619
620        return chain_res_num
621
622    def renumber_region_of_interest(
623        self,
624        region_of_interest: Dict[str, list],
625    ):
626        """Offset the region of interest to the AF2/3 numbering.
627
628        Region of interest is defined by the user is as per the original numbering
629        (UniProt in case of proteins). \n
630        However, if the prediction is done on a fragment of the protein, the
631        residue numbering in the predicted structure will be different compared to
632        the one provided by the user. \n
633        This function renumbers the region of interest to the numbering of the
634        predicted structure based on the `offset`. \n
635
636        Arguments:
637
638        - **region_of_interest (dict)**:
639            Dictionary containing the region of interest for each chain.
640
641        Returns:
642
643        - **renumbered_region_of_interest (dict)**:
644            Dictionary containing the renumbered region of interest for
645            each chain.
646        """
647
648        renumbered_region_of_interest = {}
649
650        for chain_id in region_of_interest:
651
652            start, end = region_of_interest[chain_id]
653
654            if chain_id in self.offset:
655
656                start = start - (self.offset[chain_id][0] - 1)
657                end = end - (self.offset[chain_id][0] - 1)
658
659            renumbered_region_of_interest[chain_id] = [start, end]
660
661        return renumbered_region_of_interest
662
663    def residue_map(
664        self,
665        token_chain_ids: list,
666        token_res_ids: list,
667        token_atom_names: list,
668        depth: ResidueMapDepth = ResidueMapDepth.ATOM,
669    ):
670        """Create a map of residue indices to residue numbers and vice-versa.
671
672        `res_idx` is essentially the token index. \n
673        `res_num` is the residue number in the chain which will be same as UniProt
674        numbering if the `offset` is provided correctly or full-length sequences
675        were used in the prediction. \n
676        `offset` informs what is the starting residue number for each chain.
677
678        For example, if the input is as follows: \n
679        ```python
680        token_chain_ids = ['A', 'A', 'A', 'B', 'B']
681        token_res_ids = [1, 1, 2, 1, 2]
682        token_atom_names = ['N', 'CA', 'N', 'N', 'CA']
683        ```
684        The output will be: \n
685        ```python
686        idx_to_num = {
687            0: {'chain_id': 'A', 'res_num': 1, 'atom_name': 'N'},
688            1: {'chain_id': 'A', 'res_num': 1, 'atom_name': 'CA'},
689            2: {'chain_id': 'A', 'res_num': 2, 'atom_name': 'N'},
690            3: {'chain_id': 'B', 'res_num': 1, 'atom_name': 'N'},
691            4: {'chain_id': 'B', 'res_num': 2, 'atom_name': 'CA'},
692        }
693        num_to_idx = {
694            'A': {
695                1: {'N': 0, 'CA': 1},
696                2: {'N': 2},
697            },
698            'B': {
699                1: {'N': 3},
700                2: {'CA': 4},
701            },
702        }
703        ```
704
705        If the offset is provided as: \n
706        ```python
707        offset = {
708            'A': [10, 11],
709            'B': [30, 31],
710        }
711        ```
712        The output will be: \n
713        ```python
714        idx_to_num = {
715            0: {'chain_id': 'A', 'res_num': 10, 'atom_name': 'N'},
716            1: {'chain_id': 'A', 'res_num': 10, 'atom_name': 'CA'},
717            2: {'chain_id': 'A', 'res_num': 11, 'atom_name': 'N'},
718            3: {'chain_id': 'B', 'res_num': 30, 'atom_name': 'N'},
719            4: {'chain_id': 'B', 'res_num': 31, 'atom_name': 'CA'},
720        }
721        num_to_idx = {
722            'A': {
723                10: {'N': 0, 'CA': 1},
724                11: {'N': 2},
725            },
726            'B': {
727                30: {'N': 3},
728                31: {'CA': 4},
729            },
730        }
731        ```
732
733        Arguments:
734
735        - **token_chain_ids (list)**:<br />
736            Token chain IDs. Same as provided in the AF3 JSON output.
737
738        - **token_res_ids (list)**:<br />
739            Token residue IDs. Same as provided in the AF3 JSON output.
740
741        - **token_atom_names (list)**:<br />
742            Token atom names.
743
744        Returns:
745
746        - **(tuple)**:<br />
747
748            - `idx_to_num (Dict)`:
749                Dictionary mapping token indices to token numbers.
750
751            - `num_to_idx (Dict)`:
752                Dictionary mapping token numbers to token indices.
753        """
754
755        idx_to_num = {}
756        num_to_idx = defaultdict(dict)
757
758        for token_idx, (chain_id, token_num, atom_name) in enumerate(
759            zip(token_chain_ids, token_res_ids, token_atom_names)
760        ):
761
762            token_num = self.renumber_chain_res_num(
763                chain_res_num=token_num,
764                chain_id=chain_id,
765            )
766
767            idx_to_num[token_idx] = {
768                ResidueMapKeys.CHAIN_ID: chain_id,
769                ResidueMapKeys.TOKEN_NUM: token_num,
770            }
771
772            if depth == ResidueMapDepth.ATOM:
773                idx_to_num[token_idx][ResidueMapKeys.ATOM_NAME] = atom_name
774
775            if token_num not in num_to_idx[chain_id]:
776                if depth == ResidueMapDepth.ATOM:
777                    num_to_idx[chain_id][token_num] = {
778                        atom_name: token_idx
779                    }
780                elif depth == ResidueMapDepth.RESIDUE:
781                    num_to_idx[chain_id][token_num] = token_idx
782            else:
783                if depth == ResidueMapDepth.ATOM:
784                    num_to_idx[chain_id][token_num][atom_name] = token_idx
785                elif depth == ResidueMapDepth.RESIDUE:
786                    warnings.warn(
787                        f"""
788
789                        Multiple tokens found for residue number {token_num}
790                        in chain {chain_id}. Overwriting the previous token index.
791                        """
792                    )
793                    num_to_idx[chain_id][token_num] = token_idx
794
795        return idx_to_num, num_to_idx
796
797
798class ResidueSelect(Select):
799    """Class to select residues in the structure based on the input dictionary.
800
801    This is achieved by overriding the `Bio.PDB.Select.accept_residue` method
802    from Biopython.
803    """
804
805    confident_residues: Dict
806    """ Dictionary containing the chain ID as key and a list of residue
807    numbers as value.<br />
808    e.g. `{"A": [1, 2, 3], "B": [4, 5, 6]}`
809    """
810
811    def __init__(
812        self,
813        confident_residues: Dict
814    ):
815        self.confident_residues = confident_residues
816
817    @overload
818    def accept_residue(
819        self,
820        residue: Bio.PDB.Residue.Residue
821    ) -> bool:
822        ...
823
824    @overload
825    def accept_residue(self, residue):
826        """Overload this to reject residues for output."""
827        return 1
828
829    def accept_residue(
830        self,
831        residue: Bio.PDB.Residue.Residue
832    ) -> bool:
833        """Accept the residue if it's in `confident_residues`.
834
835        Arguments:
836
837        - **residue (Bio.PDB.Residue.Residue)**:<br />
838            Biopython residue object.
839
840        Returns:
841
842        - **(bool)**:<br />
843            `True` if the residue is in the `confident_residues`.
844        """
845
846        chain = residue.parent
847        if not isinstance(chain, Chain):
848            raise TypeError(
849                f"Expected a Bio.PDB.Chain.Chain object, got {type(chain)}"
850            )
851        chain_id = chain.id
852
853        return residue.id[1] in self.confident_residues.get(chain_id, [])
def get_interaction_map( coords1: numpy.ndarray, coords2: numpy.ndarray, contact_threshold: float, map_type: af_pipeline.constants.af_constants.InteractionMapType) -> numpy.ndarray:
 47def get_interaction_map(
 48    coords1: np.ndarray,
 49    coords2: np.ndarray,
 50    contact_threshold: float,
 51    map_type: InteractionMapType,
 52) -> np.ndarray:
 53    """ Create an interaction map, given the input coordinates.
 54
 55    Returns a distance map or a contact map, based on the map_type specified.
 56
 57    ## Arguments:
 58
 59    - **coords1 (np.ndarray)**:<br />
 60        Coordinates of shape (N, 3) for the first set of atoms/residues.
 61
 62    - **coords2 (np.ndarray)**:<br />
 63        Coordinates of shape (M, 3) for the second set of atoms/residues.
 64
 65    - **contact_threshold (float)**:<br />
 66        Distance threshold to define a contact.
 67        Only used if `map_type` is `InteractionMapType.CONTACT`.
 68
 69    - **map_type (InteractionMapType)**:<br />
 70        Type of interaction map to create.
 71        Can be either `InteractionMapType.DISTANCE` or `InteractionMapType.CONTACT`.
 72
 73         - `InteractionMapType.DISTANCE`:
 74            Returns a distance map of shape (N, M)
 75            where each element (i, j) is the distance between the i-th atom/residue
 76            in `coords1` and the j-th atom/residue in `coords2`.
 77
 78         - `InteractionMapType.CONTACT`:
 79            Returns a binary contact map of shape (N, M)
 80            where each element (i, j) is 1 if the distance between the i-th atom/residue
 81            in `coords1` and the j-th atom/residue in `coords2` is less than or equal to
 82            `contact_threshold`, and 0 otherwise.
 83
 84    ## Returns:
 85
 86    - **np.ndarray**:<br />
 87        The interaction map as a numpy array.
 88    """
 89
 90    if map_type not in VALID_INTERACTION_MAP_TYPES:
 91        raise ValueError(
 92            f"Invalid map_type specified."\
 93            f"Expected one of {VALID_INTERACTION_MAP_TYPES}, got {map_type}"
 94        )
 95
 96    distance_map = get_distance_map(coords1, coords2)
 97
 98    if map_type == InteractionMapType.DISTANCE:
 99        return distance_map
100
101    elif map_type == InteractionMapType.CONTACT:
102        contact_map = get_contact_map(distance_map, contact_threshold)
103        return contact_map

Create an interaction map, given the input coordinates.

Returns a distance map or a contact map, based on the map_type specified.

Arguments:

  • coords1 (np.ndarray):
    Coordinates of shape (N, 3) for the first set of atoms/residues.

  • coords2 (np.ndarray):
    Coordinates of shape (M, 3) for the second set of atoms/residues.

  • contact_threshold (float):
    Distance threshold to define a contact. Only used if map_type is InteractionMapType.CONTACT.

  • map_type (InteractionMapType):
    Type of interaction map to create. Can be either InteractionMapType.DISTANCE or InteractionMapType.CONTACT.

    • InteractionMapType.DISTANCE: Returns a distance map of shape (N, M) where each element (i, j) is the distance between the i-th atom/residue in coords1 and the j-th atom/residue in coords2.

    • InteractionMapType.CONTACT: Returns a binary contact map of shape (N, M) where each element (i, j) is 1 if the distance between the i-th atom/residue in coords1 and the j-th atom/residue in coords2 is less than or equal to contact_threshold, and 0 otherwise.

Returns:

  • np.ndarray:
    The interaction map as a numpy array.
def get_distance_map(coords1: numpy.ndarray, coords2: numpy.ndarray):
105def get_distance_map(coords1: np.ndarray, coords2: np.ndarray):
106    """ Create an all-v-all distance map.
107
108    Returns a matrix of distances between all pairs of atoms/residues in the
109    two sets of coordinates.
110
111    ## Arguments:
112
113    - **coords1 (np.ndarray)**:<br />
114        Coordinates of shape (N, 3) for the first set of atoms/residues.
115
116    - **coords2 (np.ndarray)**:<br />
117        Coordinates of shape (M, 3) for the second set of atoms/residues.
118
119    ## Returns:
120
121    - **np.ndarray**:<br />
122        A matrix of shape (N, M) where each element (i, j) is the Euclidean distance
123        between the i-th atom/residue in `coords1` and the j-th atom/residue in `coords2`.
124    """
125
126    from scipy.spatial import distance_matrix
127
128    distance_map = distance_matrix(coords1, coords2)
129
130    return distance_map

Create an all-v-all distance map.

Returns a matrix of distances between all pairs of atoms/residues in the two sets of coordinates.

Arguments:

  • coords1 (np.ndarray):
    Coordinates of shape (N, 3) for the first set of atoms/residues.

  • coords2 (np.ndarray):
    Coordinates of shape (M, 3) for the second set of atoms/residues.

Returns:

  • np.ndarray:
    A matrix of shape (N, M) where each element (i, j) is the Euclidean distance between the i-th atom/residue in coords1 and the j-th atom/residue in coords2.
def get_contact_map(distance_map: numpy.ndarray, contact_threshold: float):
132def get_contact_map(distance_map: np.ndarray, contact_threshold: float):
133    """ Given the distance map, create a binary contact map by thresholding distances.
134
135    Returns a binary matrix, where 1 indicates a contact and 0 indicates no contact.
136
137    ## Arguments:
138
139    - **distance_map (np.ndarray)**:<br />
140        A matrix of shape (N, M) where each element (i, j) is the Euclidean distance
141        between the i-th atom/residue in `coords1` and the j-th atom/residue in `coords2`.
142
143    - **contact_threshold (float)**:<br />
144        Distance threshold to define a contact. If the distance between two atoms/residues
145        is less than or equal to this threshold, they are considered to be in contact.
146
147    ## Returns:
148
149    - **np.ndarray**:<br />
150        A binary matrix of shape (N, M) where each element (i, j) is 1 if the distance
151        between the i-th atom/residue in `coords1` and the j-th atom/residue in `coords2`
152        is less than or equal to `contact_threshold`, and 0 otherwise.
153    """
154
155    contact_map = np.where(distance_map <= contact_threshold, 1, 0)
156
157    return contact_map

Given the distance map, create a binary contact map by thresholding distances.

Returns a binary matrix, where 1 indicates a contact and 0 indicates no contact.

Arguments:

  • distance_map (np.ndarray):
    A matrix of shape (N, M) where each element (i, j) is the Euclidean distance between the i-th atom/residue in coords1 and the j-th atom/residue in coords2.

  • contact_threshold (float):
    Distance threshold to define a contact. If the distance between two atoms/residues is less than or equal to this threshold, they are considered to be in contact.

Returns:

  • np.ndarray:
    A binary matrix of shape (N, M) where each element (i, j) is 1 if the distance between the i-th atom/residue in coords1 and the j-th atom/residue in coords2 is less than or equal to contact_threshold, and 0 otherwise.
def get_token_level( residue: Bio.PDB.Residue.Residue) -> af_pipeline.constants.af_constants.TokenLevel:
159def get_token_level(residue: Bio.PDB.Residue.Residue) -> TokenLevel:
160    """ Get the token level of the residue.
161
162    Assuming the `residue` object is decorated with the appropriate attributes.
163    - `tokenLevel`: `TokenLevel.ATOM` if the residue has per-atom token, `TokenLevel.RESIDUE` otherwise.
164
165    Arguments:
166
167    - **residue (Bio.PDB.Residue.Residue)**:<br />
168        Biopython residue object.
169
170    Returns:
171
172    - **token_level (TokenLevel)**:<br />
173        Token level of the residue.
174    """
175
176    token_level = residue.xtra.get(
177        ResidueDecoration.TOKEN_LEVEL, TokenLevel.RESIDUE
178    )
179
180    return token_level

Get the token level of the residue.

Assuming the residue object is decorated with the appropriate attributes.

  • tokenLevel: TokenLevel.ATOM if the residue has per-atom token, TokenLevel.RESIDUE otherwise.

Arguments:

  • residue (Bio.PDB.Residue.Residue):
    Biopython residue object.

Returns:

  • token_level (TokenLevel):
    Token level of the residue.
def has_modifications(structure: Bio.PDB.Structure.Structure) -> bool:
182def has_modifications(structure: Bio.PDB.Structure.Structure) -> bool:
183    """ Check if the structure has any modified residues.
184
185    Assuming the `residue` objects are decorated with the appropriate attributes.
186    - `is_modified`: `True` if the `residue` is modified.
187
188    Arguments:
189
190    - **structure (Bio.PDB.Structure.Structure)**:<br />
191        Biopython structure object.
192
193    Returns:
194
195    - **condition (bool)**:<br />
196        `True` if the structure has any modified residues, `False` otherwise.
197    """
198
199    for residue in structure.get_residues():
200        if residue.xtra.get(ResidueDecoration.IS_MODIFIED, False):
201            return True
202
203    return False

Check if the structure has any modified residues.

Assuming the residue objects are decorated with the appropriate attributes.

  • is_modified: True if the residue is modified.

Arguments:

  • structure (Bio.PDB.Structure.Structure):
    Biopython structure object.

Returns:

  • condition (bool):
    True if the structure has any modified residues, False otherwise.
def save_structure_obj( structure: Bio.PDB.Structure.Structure, out_file: str, res_select_obj: Bio.PDB.PDBIO.Select = <Select all>, save_type: str = <FileFormat.CIF: 'cif'>, preserve_header_footer=False):
205def save_structure_obj(
206    structure: Bio.PDB.Structure.Structure,
207    out_file: str,
208    res_select_obj: Select = Select(),
209    save_type: str = FileFormat.CIF,
210    preserve_header_footer = False,
211):
212    """Save the selection in Biopython structure object as a PDB or CIF file.
213
214    Arguments:
215
216    - **structure (Bio.PDB.Structure.Structure)**:<br />
217        Biopython structure object to save.
218
219    - **out_file (str)**:<br />
220        Path to the output file where the structure will be saved.
221
222    - **res_select_obj (Bio.PDB.Select, optional)**:<br />
223        Biopython Select object to filter the residues to save.
224        Defaults to `Select()` which saves all residues.
225
226    - **save_type (str, optional)**:<br />
227        Type of file to save the structure as.
228        Can be either "pdb" or "cif".
229
230    - **preserve_header_footer (bool, optional)**:<br />
231        If `True`, the header and footer information from the structure
232        will be preserved in the saved file.
233        > [!NOTE]
234        > The header and footer information can only be preserved for CIF files.
235    """
236
237    if save_type == FileFormat.PDB:
238
239        io = PDBIO()
240        io.set_structure(structure)
241        io.save(out_file, res_select_obj)
242
243        if preserve_header_footer:
244            warnings.warn(
245                """
246
247                PDB files do not support headers and footers.
248                Saving without headers and footers.
249                """
250            )
251
252    elif save_type == FileFormat.CIF:
253
254        io = MMCIFIO()
255        io.set_structure(structure)
256        io.save(out_file, res_select_obj)
257
258        if preserve_header_footer:
259
260            header_footer = structure.xtra.get("header_footer", {})
261
262            if {"header", "footer"}.issubset(set(header_footer.keys())):
263
264                with open(out_file, "r") as f:
265                    lines = f.readlines()
266
267                lines.insert(0, "\n")
268                lines.append("\n")
269
270                for header_line in header_footer["header"][::-1]:
271                    lines.insert(0, f"{header_line}")
272
273                for footer_line in header_footer["footer"][::-1]:
274                    lines.append(f"{footer_line}")
275
276                with open(out_file, "w") as f:
277                    for line in lines:
278                        f.write(line)
279
280            else:
281                warnings.warn(
282                    """
283
284                    No header or footer information found in the structure.
285                    Saving without headers and footers.
286                    """
287                )

Save the selection in Biopython structure object as a PDB or CIF file.

Arguments:

  • structure (Bio.PDB.Structure.Structure):
    Biopython structure object to save.

  • out_file (str):
    Path to the output file where the structure will be saved.

  • res_select_obj (Bio.PDB.Select, optional):
    Biopython Select object to filter the residues to save. Defaults to Select() which saves all residues.

  • save_type (str, optional):
    Type of file to save the structure as. Can be either "pdb" or "cif".

  • preserve_header_footer (bool, optional):
    If True, the header and footer information from the structure will be preserved in the saved file.

    Note

    The header and footer information can only be preserved for CIF files.

def decorate_residue( residue: Bio.PDB.Residue.Residue, xtra_field: str | None = None, xtra_value: Any = None):
358def decorate_residue(
359    residue: Bio.PDB.Residue.Residue,
360    xtra_field: str | None = None,
361    xtra_value: Any = None,
362):
363    """Decorate the residue.
364
365    This function adds `entityType` attribute to the residue's `xtra`
366    attribute depending on the type of entity.
367
368    Allowed entity types are:
369    ```python
370    - "proteinChain"
371    - "dnaSequence"
372    - "rnaSequence"
373    - "ligand"
374    - "ion"
375    - None #(if the residue does not belong to any of the above types)
376    ```
377
378    It also adds the following flags to the residue's `xtra` wherever applicable -
379    ```python
380    - "is_modified" # boolean indicating if the residue is modified
381    - "is_ca_only" # boolean indicating if the residue has only CA atom
382    - "is_purine" # boolean indicating if the nucleotide is purine
383    - "is_pyrimidine" # boolean indicating if the nucleotide is pyrimidine
384    - "is_ion" # boolean indicating if the residue is an ion.
385    - "is_ligand" # boolean indicating if the residue is a ligand.
386    ```
387
388    Arguments:
389
390    - **residue (Bio.PDB.Residue.Residue)**:<br />
391        Biopython residue object.
392    """
393
394    symbol = residue.get_resname()
395
396    if symbol in PROTEIN_ENTITIES:
397        residue.xtra[ResidueDecoration.ENTITY_TYPE] = EntityType.PROTEIN_CHAIN
398
399        if symbol in ALLOWED_PTMS:
400            residue.xtra[ResidueDecoration.IS_MODIFIED] = True
401
402        if symbol in ONLY_CA_RESIDUES:
403            residue.xtra[ResidueDecoration.IS_CA_ONLY] = True
404
405    elif symbol in DNA_ENTITIES:
406        residue.xtra[ResidueDecoration.ENTITY_TYPE] = EntityType.DNA_SEQUENCE
407
408        if symbol in ALLOWED_DNA_MODS:
409            residue.xtra[ResidueDecoration.IS_MODIFIED] = True
410    elif symbol in RNA_ENTITIES:
411        residue.xtra[ResidueDecoration.ENTITY_TYPE] = EntityType.RNA_SEQUENCE
412
413        if symbol in ALLOWED_RNA_MODS:
414            residue.xtra[ResidueDecoration.IS_MODIFIED] = True
415
416    elif symbol in ALLOWED_LIGANDS:
417        residue.xtra[ResidueDecoration.ENTITY_TYPE] = EntityType.LIGAND
418
419    elif symbol in ION:
420        residue.xtra[ResidueDecoration.ENTITY_TYPE] = EntityType.ION
421
422    else:
423        warnings.warn(dedent(f"""
424            The residue "{symbol}" does not belong to any known entity types.
425            Setting 'entityType' to None.""")
426        )
427        residue.xtra[ResidueDecoration.ENTITY_TYPE] = None
428
429    if symbol in PURINES:
430        residue.xtra[ResidueDecoration.IS_PURINE] = True
431
432    elif symbol in PYRIMIDINES:
433        residue.xtra[ResidueDecoration.IS_PYRIMIDINE] = True
434
435    if xtra_field is not None and xtra_value is not None:
436        if xtra_field in residue.xtra:
437            warnings.warn(dedent(f"""
438                The field '{xtra_field}' already exists in the residue's xtra.
439                Overwriting the value.""")
440            )
441        residue.xtra[xtra_field] = xtra_value
442
443    condition = (
444        residue.xtra.get(ResidueDecoration.IS_MODIFIED)
445        or residue.xtra.get(ResidueDecoration.ENTITY_TYPE) in [EntityType.LIGAND, None]
446    )
447    _get_token_level = {True: TokenLevel.ATOM, False: TokenLevel.RESIDUE}
448    residue.xtra[ResidueDecoration.TOKEN_LEVEL] = _get_token_level[condition]

Decorate the residue.

This function adds entityType attribute to the residue's xtra attribute depending on the type of entity.

Allowed entity types are:

- "proteinChain"
- "dnaSequence"
- "rnaSequence"
- "ligand"
- "ion"
- None #(if the residue does not belong to any of the above types)

It also adds the following flags to the residue's xtra wherever applicable -

- "is_modified" # boolean indicating if the residue is modified
- "is_ca_only" # boolean indicating if the residue has only CA atom
- "is_purine" # boolean indicating if the nucleotide is purine
- "is_pyrimidine" # boolean indicating if the nucleotide is pyrimidine
- "is_ion" # boolean indicating if the residue is an ion.
- "is_ligand" # boolean indicating if the residue is a ligand.

Arguments:

  • residue (Bio.PDB.Residue.Residue):
    Biopython residue object.
def decorate_atom( atom: Bio.PDB.Atom.Atom, xtra_field: str | None = None, xtra_value: Any = None):
450def decorate_atom(
451    atom: Bio.PDB.Atom.Atom,
452    xtra_field: str | None = None,
453    xtra_value: Any = None,
454):
455    """ Decorate the atom.
456
457    This function adds `is_representative` attribute to the atom's `xtra`
458
459    Arguments:
460
461    - **atom (Bio.PDB.Atom.Atom)**:<br />
462        Biopython atom object.
463
464    - **xtra_field (str | None, optional)**:<br />
465        field to add to the atom's `xtra`.
466
467    - **xtra_value (Any, optional)**:<br />
468        value of the field to add to the atom's `xtra`.
469    """
470
471    symbol = atom.get_name()
472    residue = atom.get_parent()
473
474    if not isinstance(residue, Bio.PDB.Residue.Residue):
475        raise TypeError(
476            f"Expected a `Bio.PDB.Residue.Residue` object, got {type(residue)}"
477        )
478
479    residue = residue.get_resname()
480
481    residue_attrs = (
482        residue in PROTEIN_ENTITIES,
483        residue in ONLY_CA_RESIDUES,
484        residue in PURINES,
485        residue in PYRIMIDINES,
486        symbol
487    )
488
489    atom_xtra_dict = {
490        (True, False, False, False, "CB"): True,
491        (True, True, False, False, "CA"): True,
492        (False, False, True, False, "C4"): True,
493        (False, False, False, True, "C2"): True,
494    }
495
496    atom.xtra[AtomDecoration.IS_REPRESENTATIVE] = atom_xtra_dict.get(
497        residue_attrs, False
498    )
499
500    if (
501        xtra_field is not None
502        and xtra_value is not None
503        and xtra_field in atom.xtra
504    ):
505        warnings.warn(dedent(f"""
506            The field '{xtra_field}' already exists in the atom's xtra.
507            Overwriting the value.""")
508        )
509        atom.xtra[xtra_field] = xtra_value

Decorate the atom.

This function adds is_representative attribute to the atom's xtra

Arguments:

  • atom (Bio.PDB.Atom.Atom):
    Biopython atom object.

  • xtra_field (str | None, optional):
    field to add to the atom's xtra.

  • xtra_value (Any, optional):
    value of the field to add to the atom's xtra.

class RenumberResidues:
512class RenumberResidues:
513    """Class to renumber the residues based on the offset.
514
515    If the input sequence to the AlphFold does not start from the first residue,
516    the residue numbering in the predicted structure will be different from
517    the original (UniProt in case of proteins) numbering.
518
519    This class renumbers the residues in the structure based on the provided `offset`.
520
521    > [!NOTE]
522    > Although written with the purpose for AlphFold predicted structures,
523    > this class can be used to renumber residues in any structure file.
524    """
525
526    offset: dict
527    """ Offset describing start and end residue number for each chain in
528    the predicted structure.\n
529    example: `{'A': [1, 100], 'B': [101, 200]}`."""
530
531    def __init__(self, offset: dict = {}):
532
533        self.offset = offset
534
535    def renumber_structure(
536        self,
537        structure: Bio.PDB.Structure.Structure,
538    ):
539        """Renumber the residues in the structure based on the offset.
540
541        Arguments:
542
543        - **structure (Bio.PDB.Structure.Structure)**:<br />
544            Biopython structure object.
545
546        Returns:
547
548        - **structure (Bio.PDB.Structure.Structure)**:<br />
549            Biopython structure object with renumbered residues.
550        """
551
552        for model in structure:
553            for residue in model.get_residues():
554                chain_id = residue.parent.id
555                h, num, ins = residue.id
556
557                num = self.renumber_chain_res_num(
558                    chain_res_num=num,
559                    chain_id=chain_id,
560                )
561
562                residue.id = (h, num, ins)
563
564        return structure
565
566    def original_chain_res_num(
567        self,
568        chain_res_num: int,
569        chain_id: str,
570    ):
571        """Get the original residue number based on the offset.
572
573        Inverse of `renumber_chain_res_num`.
574
575        Arguments:
576
577        - **chain_res_num (int)**:<br />
578            Residue index (1-indexed) within the chain in the predicted structure.
579
580        - **chain_id (str)**:<br />
581            Chain ID of the residue.
582
583        Returns:
584
585        - **chain_res_num (int)**:<br />
586            Original residue number
587        """
588
589        if chain_id in self.offset:
590            chain_res_num -= (self.offset[chain_id][0] - 1)
591
592        return chain_res_num
593
594    def renumber_chain_res_num(
595        self,
596        chain_res_num: int,
597        chain_id: str,
598    ):
599        """Renumber the residue number based on the offset.
600
601        Given a residue index (1-indexed) in a chain, this function renumbers it
602        based on the offset provided for that chain.
603
604        Arguments:
605
606        - **chain_res_num (int)**:<br />
607            Residue index (1-indexed) within the chain in the predicted structure.
608
609        - **chain_id (str)**:<br />
610            Chain ID of the residue.
611
612        Returns:
613
614        - **chain_res_num (int)**:<br />
615            Renumbered residue number
616        """
617
618        if chain_id in self.offset:
619            chain_res_num += self.offset[chain_id][0] - 1
620
621        return chain_res_num
622
623    def renumber_region_of_interest(
624        self,
625        region_of_interest: Dict[str, list],
626    ):
627        """Offset the region of interest to the AF2/3 numbering.
628
629        Region of interest is defined by the user is as per the original numbering
630        (UniProt in case of proteins). \n
631        However, if the prediction is done on a fragment of the protein, the
632        residue numbering in the predicted structure will be different compared to
633        the one provided by the user. \n
634        This function renumbers the region of interest to the numbering of the
635        predicted structure based on the `offset`. \n
636
637        Arguments:
638
639        - **region_of_interest (dict)**:
640            Dictionary containing the region of interest for each chain.
641
642        Returns:
643
644        - **renumbered_region_of_interest (dict)**:
645            Dictionary containing the renumbered region of interest for
646            each chain.
647        """
648
649        renumbered_region_of_interest = {}
650
651        for chain_id in region_of_interest:
652
653            start, end = region_of_interest[chain_id]
654
655            if chain_id in self.offset:
656
657                start = start - (self.offset[chain_id][0] - 1)
658                end = end - (self.offset[chain_id][0] - 1)
659
660            renumbered_region_of_interest[chain_id] = [start, end]
661
662        return renumbered_region_of_interest
663
664    def residue_map(
665        self,
666        token_chain_ids: list,
667        token_res_ids: list,
668        token_atom_names: list,
669        depth: ResidueMapDepth = ResidueMapDepth.ATOM,
670    ):
671        """Create a map of residue indices to residue numbers and vice-versa.
672
673        `res_idx` is essentially the token index. \n
674        `res_num` is the residue number in the chain which will be same as UniProt
675        numbering if the `offset` is provided correctly or full-length sequences
676        were used in the prediction. \n
677        `offset` informs what is the starting residue number for each chain.
678
679        For example, if the input is as follows: \n
680        ```python
681        token_chain_ids = ['A', 'A', 'A', 'B', 'B']
682        token_res_ids = [1, 1, 2, 1, 2]
683        token_atom_names = ['N', 'CA', 'N', 'N', 'CA']
684        ```
685        The output will be: \n
686        ```python
687        idx_to_num = {
688            0: {'chain_id': 'A', 'res_num': 1, 'atom_name': 'N'},
689            1: {'chain_id': 'A', 'res_num': 1, 'atom_name': 'CA'},
690            2: {'chain_id': 'A', 'res_num': 2, 'atom_name': 'N'},
691            3: {'chain_id': 'B', 'res_num': 1, 'atom_name': 'N'},
692            4: {'chain_id': 'B', 'res_num': 2, 'atom_name': 'CA'},
693        }
694        num_to_idx = {
695            'A': {
696                1: {'N': 0, 'CA': 1},
697                2: {'N': 2},
698            },
699            'B': {
700                1: {'N': 3},
701                2: {'CA': 4},
702            },
703        }
704        ```
705
706        If the offset is provided as: \n
707        ```python
708        offset = {
709            'A': [10, 11],
710            'B': [30, 31],
711        }
712        ```
713        The output will be: \n
714        ```python
715        idx_to_num = {
716            0: {'chain_id': 'A', 'res_num': 10, 'atom_name': 'N'},
717            1: {'chain_id': 'A', 'res_num': 10, 'atom_name': 'CA'},
718            2: {'chain_id': 'A', 'res_num': 11, 'atom_name': 'N'},
719            3: {'chain_id': 'B', 'res_num': 30, 'atom_name': 'N'},
720            4: {'chain_id': 'B', 'res_num': 31, 'atom_name': 'CA'},
721        }
722        num_to_idx = {
723            'A': {
724                10: {'N': 0, 'CA': 1},
725                11: {'N': 2},
726            },
727            'B': {
728                30: {'N': 3},
729                31: {'CA': 4},
730            },
731        }
732        ```
733
734        Arguments:
735
736        - **token_chain_ids (list)**:<br />
737            Token chain IDs. Same as provided in the AF3 JSON output.
738
739        - **token_res_ids (list)**:<br />
740            Token residue IDs. Same as provided in the AF3 JSON output.
741
742        - **token_atom_names (list)**:<br />
743            Token atom names.
744
745        Returns:
746
747        - **(tuple)**:<br />
748
749            - `idx_to_num (Dict)`:
750                Dictionary mapping token indices to token numbers.
751
752            - `num_to_idx (Dict)`:
753                Dictionary mapping token numbers to token indices.
754        """
755
756        idx_to_num = {}
757        num_to_idx = defaultdict(dict)
758
759        for token_idx, (chain_id, token_num, atom_name) in enumerate(
760            zip(token_chain_ids, token_res_ids, token_atom_names)
761        ):
762
763            token_num = self.renumber_chain_res_num(
764                chain_res_num=token_num,
765                chain_id=chain_id,
766            )
767
768            idx_to_num[token_idx] = {
769                ResidueMapKeys.CHAIN_ID: chain_id,
770                ResidueMapKeys.TOKEN_NUM: token_num,
771            }
772
773            if depth == ResidueMapDepth.ATOM:
774                idx_to_num[token_idx][ResidueMapKeys.ATOM_NAME] = atom_name
775
776            if token_num not in num_to_idx[chain_id]:
777                if depth == ResidueMapDepth.ATOM:
778                    num_to_idx[chain_id][token_num] = {
779                        atom_name: token_idx
780                    }
781                elif depth == ResidueMapDepth.RESIDUE:
782                    num_to_idx[chain_id][token_num] = token_idx
783            else:
784                if depth == ResidueMapDepth.ATOM:
785                    num_to_idx[chain_id][token_num][atom_name] = token_idx
786                elif depth == ResidueMapDepth.RESIDUE:
787                    warnings.warn(
788                        f"""
789
790                        Multiple tokens found for residue number {token_num}
791                        in chain {chain_id}. Overwriting the previous token index.
792                        """
793                    )
794                    num_to_idx[chain_id][token_num] = token_idx
795
796        return idx_to_num, num_to_idx

Class to renumber the residues based on the offset.

If the input sequence to the AlphFold does not start from the first residue, the residue numbering in the predicted structure will be different from the original (UniProt in case of proteins) numbering.

This class renumbers the residues in the structure based on the provided offset.

Note

Although written with the purpose for AlphFold predicted structures, this class can be used to renumber residues in any structure file.

RenumberResidues(offset: dict = {})
531    def __init__(self, offset: dict = {}):
532
533        self.offset = offset
offset: dict

Offset describing start and end residue number for each chain in the predicted structure.

example: {'A': [1, 100], 'B': [101, 200]}.

def renumber_structure(self, structure: Bio.PDB.Structure.Structure):
535    def renumber_structure(
536        self,
537        structure: Bio.PDB.Structure.Structure,
538    ):
539        """Renumber the residues in the structure based on the offset.
540
541        Arguments:
542
543        - **structure (Bio.PDB.Structure.Structure)**:<br />
544            Biopython structure object.
545
546        Returns:
547
548        - **structure (Bio.PDB.Structure.Structure)**:<br />
549            Biopython structure object with renumbered residues.
550        """
551
552        for model in structure:
553            for residue in model.get_residues():
554                chain_id = residue.parent.id
555                h, num, ins = residue.id
556
557                num = self.renumber_chain_res_num(
558                    chain_res_num=num,
559                    chain_id=chain_id,
560                )
561
562                residue.id = (h, num, ins)
563
564        return structure

Renumber the residues in the structure based on the offset.

Arguments:

  • structure (Bio.PDB.Structure.Structure):
    Biopython structure object.

Returns:

  • structure (Bio.PDB.Structure.Structure):
    Biopython structure object with renumbered residues.
def original_chain_res_num(self, chain_res_num: int, chain_id: str):
566    def original_chain_res_num(
567        self,
568        chain_res_num: int,
569        chain_id: str,
570    ):
571        """Get the original residue number based on the offset.
572
573        Inverse of `renumber_chain_res_num`.
574
575        Arguments:
576
577        - **chain_res_num (int)**:<br />
578            Residue index (1-indexed) within the chain in the predicted structure.
579
580        - **chain_id (str)**:<br />
581            Chain ID of the residue.
582
583        Returns:
584
585        - **chain_res_num (int)**:<br />
586            Original residue number
587        """
588
589        if chain_id in self.offset:
590            chain_res_num -= (self.offset[chain_id][0] - 1)
591
592        return chain_res_num

Get the original residue number based on the offset.

Inverse of renumber_chain_res_num.

Arguments:

  • chain_res_num (int):
    Residue index (1-indexed) within the chain in the predicted structure.

  • chain_id (str):
    Chain ID of the residue.

Returns:

  • chain_res_num (int):
    Original residue number
def renumber_chain_res_num(self, chain_res_num: int, chain_id: str):
594    def renumber_chain_res_num(
595        self,
596        chain_res_num: int,
597        chain_id: str,
598    ):
599        """Renumber the residue number based on the offset.
600
601        Given a residue index (1-indexed) in a chain, this function renumbers it
602        based on the offset provided for that chain.
603
604        Arguments:
605
606        - **chain_res_num (int)**:<br />
607            Residue index (1-indexed) within the chain in the predicted structure.
608
609        - **chain_id (str)**:<br />
610            Chain ID of the residue.
611
612        Returns:
613
614        - **chain_res_num (int)**:<br />
615            Renumbered residue number
616        """
617
618        if chain_id in self.offset:
619            chain_res_num += self.offset[chain_id][0] - 1
620
621        return chain_res_num

Renumber the residue number based on the offset.

Given a residue index (1-indexed) in a chain, this function renumbers it based on the offset provided for that chain.

Arguments:

  • chain_res_num (int):
    Residue index (1-indexed) within the chain in the predicted structure.

  • chain_id (str):
    Chain ID of the residue.

Returns:

  • chain_res_num (int):
    Renumbered residue number
def renumber_region_of_interest(self, region_of_interest: Dict[str, list]):
623    def renumber_region_of_interest(
624        self,
625        region_of_interest: Dict[str, list],
626    ):
627        """Offset the region of interest to the AF2/3 numbering.
628
629        Region of interest is defined by the user is as per the original numbering
630        (UniProt in case of proteins). \n
631        However, if the prediction is done on a fragment of the protein, the
632        residue numbering in the predicted structure will be different compared to
633        the one provided by the user. \n
634        This function renumbers the region of interest to the numbering of the
635        predicted structure based on the `offset`. \n
636
637        Arguments:
638
639        - **region_of_interest (dict)**:
640            Dictionary containing the region of interest for each chain.
641
642        Returns:
643
644        - **renumbered_region_of_interest (dict)**:
645            Dictionary containing the renumbered region of interest for
646            each chain.
647        """
648
649        renumbered_region_of_interest = {}
650
651        for chain_id in region_of_interest:
652
653            start, end = region_of_interest[chain_id]
654
655            if chain_id in self.offset:
656
657                start = start - (self.offset[chain_id][0] - 1)
658                end = end - (self.offset[chain_id][0] - 1)
659
660            renumbered_region_of_interest[chain_id] = [start, end]
661
662        return renumbered_region_of_interest

Offset the region of interest to the AF2/3 numbering.

Region of interest is defined by the user is as per the original numbering (UniProt in case of proteins).

However, if the prediction is done on a fragment of the protein, the residue numbering in the predicted structure will be different compared to the one provided by the user.

This function renumbers the region of interest to the numbering of the predicted structure based on the offset.

Arguments:

  • region_of_interest (dict): Dictionary containing the region of interest for each chain.

Returns:

  • renumbered_region_of_interest (dict): Dictionary containing the renumbered region of interest for each chain.
def residue_map( self, token_chain_ids: list, token_res_ids: list, token_atom_names: list, depth: af_pipeline.constants.af_constants.ResidueMapDepth = <ResidueMapDepth.ATOM: 'atom'>):
664    def residue_map(
665        self,
666        token_chain_ids: list,
667        token_res_ids: list,
668        token_atom_names: list,
669        depth: ResidueMapDepth = ResidueMapDepth.ATOM,
670    ):
671        """Create a map of residue indices to residue numbers and vice-versa.
672
673        `res_idx` is essentially the token index. \n
674        `res_num` is the residue number in the chain which will be same as UniProt
675        numbering if the `offset` is provided correctly or full-length sequences
676        were used in the prediction. \n
677        `offset` informs what is the starting residue number for each chain.
678
679        For example, if the input is as follows: \n
680        ```python
681        token_chain_ids = ['A', 'A', 'A', 'B', 'B']
682        token_res_ids = [1, 1, 2, 1, 2]
683        token_atom_names = ['N', 'CA', 'N', 'N', 'CA']
684        ```
685        The output will be: \n
686        ```python
687        idx_to_num = {
688            0: {'chain_id': 'A', 'res_num': 1, 'atom_name': 'N'},
689            1: {'chain_id': 'A', 'res_num': 1, 'atom_name': 'CA'},
690            2: {'chain_id': 'A', 'res_num': 2, 'atom_name': 'N'},
691            3: {'chain_id': 'B', 'res_num': 1, 'atom_name': 'N'},
692            4: {'chain_id': 'B', 'res_num': 2, 'atom_name': 'CA'},
693        }
694        num_to_idx = {
695            'A': {
696                1: {'N': 0, 'CA': 1},
697                2: {'N': 2},
698            },
699            'B': {
700                1: {'N': 3},
701                2: {'CA': 4},
702            },
703        }
704        ```
705
706        If the offset is provided as: \n
707        ```python
708        offset = {
709            'A': [10, 11],
710            'B': [30, 31],
711        }
712        ```
713        The output will be: \n
714        ```python
715        idx_to_num = {
716            0: {'chain_id': 'A', 'res_num': 10, 'atom_name': 'N'},
717            1: {'chain_id': 'A', 'res_num': 10, 'atom_name': 'CA'},
718            2: {'chain_id': 'A', 'res_num': 11, 'atom_name': 'N'},
719            3: {'chain_id': 'B', 'res_num': 30, 'atom_name': 'N'},
720            4: {'chain_id': 'B', 'res_num': 31, 'atom_name': 'CA'},
721        }
722        num_to_idx = {
723            'A': {
724                10: {'N': 0, 'CA': 1},
725                11: {'N': 2},
726            },
727            'B': {
728                30: {'N': 3},
729                31: {'CA': 4},
730            },
731        }
732        ```
733
734        Arguments:
735
736        - **token_chain_ids (list)**:<br />
737            Token chain IDs. Same as provided in the AF3 JSON output.
738
739        - **token_res_ids (list)**:<br />
740            Token residue IDs. Same as provided in the AF3 JSON output.
741
742        - **token_atom_names (list)**:<br />
743            Token atom names.
744
745        Returns:
746
747        - **(tuple)**:<br />
748
749            - `idx_to_num (Dict)`:
750                Dictionary mapping token indices to token numbers.
751
752            - `num_to_idx (Dict)`:
753                Dictionary mapping token numbers to token indices.
754        """
755
756        idx_to_num = {}
757        num_to_idx = defaultdict(dict)
758
759        for token_idx, (chain_id, token_num, atom_name) in enumerate(
760            zip(token_chain_ids, token_res_ids, token_atom_names)
761        ):
762
763            token_num = self.renumber_chain_res_num(
764                chain_res_num=token_num,
765                chain_id=chain_id,
766            )
767
768            idx_to_num[token_idx] = {
769                ResidueMapKeys.CHAIN_ID: chain_id,
770                ResidueMapKeys.TOKEN_NUM: token_num,
771            }
772
773            if depth == ResidueMapDepth.ATOM:
774                idx_to_num[token_idx][ResidueMapKeys.ATOM_NAME] = atom_name
775
776            if token_num not in num_to_idx[chain_id]:
777                if depth == ResidueMapDepth.ATOM:
778                    num_to_idx[chain_id][token_num] = {
779                        atom_name: token_idx
780                    }
781                elif depth == ResidueMapDepth.RESIDUE:
782                    num_to_idx[chain_id][token_num] = token_idx
783            else:
784                if depth == ResidueMapDepth.ATOM:
785                    num_to_idx[chain_id][token_num][atom_name] = token_idx
786                elif depth == ResidueMapDepth.RESIDUE:
787                    warnings.warn(
788                        f"""
789
790                        Multiple tokens found for residue number {token_num}
791                        in chain {chain_id}. Overwriting the previous token index.
792                        """
793                    )
794                    num_to_idx[chain_id][token_num] = token_idx
795
796        return idx_to_num, num_to_idx

Create a map of residue indices to residue numbers and vice-versa.

res_idx is essentially the token index.

res_num is the residue number in the chain which will be same as UniProt numbering if the offset is provided correctly or full-length sequences were used in the prediction.

offset informs what is the starting residue number for each chain.

For example, if the input is as follows:

token_chain_ids = ['A', 'A', 'A', 'B', 'B']
token_res_ids = [1, 1, 2, 1, 2]
token_atom_names = ['N', 'CA', 'N', 'N', 'CA']

The output will be:

idx_to_num = {
    0: {'chain_id': 'A', 'res_num': 1, 'atom_name': 'N'},
    1: {'chain_id': 'A', 'res_num': 1, 'atom_name': 'CA'},
    2: {'chain_id': 'A', 'res_num': 2, 'atom_name': 'N'},
    3: {'chain_id': 'B', 'res_num': 1, 'atom_name': 'N'},
    4: {'chain_id': 'B', 'res_num': 2, 'atom_name': 'CA'},
}
num_to_idx = {
    'A': {
        1: {'N': 0, 'CA': 1},
        2: {'N': 2},
    },
    'B': {
        1: {'N': 3},
        2: {'CA': 4},
    },
}

If the offset is provided as:

offset = {
    'A': [10, 11],
    'B': [30, 31],
}

The output will be:

idx_to_num = {
    0: {'chain_id': 'A', 'res_num': 10, 'atom_name': 'N'},
    1: {'chain_id': 'A', 'res_num': 10, 'atom_name': 'CA'},
    2: {'chain_id': 'A', 'res_num': 11, 'atom_name': 'N'},
    3: {'chain_id': 'B', 'res_num': 30, 'atom_name': 'N'},
    4: {'chain_id': 'B', 'res_num': 31, 'atom_name': 'CA'},
}
num_to_idx = {
    'A': {
        10: {'N': 0, 'CA': 1},
        11: {'N': 2},
    },
    'B': {
        30: {'N': 3},
        31: {'CA': 4},
    },
}

Arguments:

  • token_chain_ids (list):
    Token chain IDs. Same as provided in the AF3 JSON output.

  • token_res_ids (list):
    Token residue IDs. Same as provided in the AF3 JSON output.

  • token_atom_names (list):
    Token atom names.

Returns:

  • (tuple):

    • idx_to_num (Dict): Dictionary mapping token indices to token numbers.

    • num_to_idx (Dict): Dictionary mapping token numbers to token indices.

class ResidueSelect(Bio.PDB.PDBIO.Select):
799class ResidueSelect(Select):
800    """Class to select residues in the structure based on the input dictionary.
801
802    This is achieved by overriding the `Bio.PDB.Select.accept_residue` method
803    from Biopython.
804    """
805
806    confident_residues: Dict
807    """ Dictionary containing the chain ID as key and a list of residue
808    numbers as value.<br />
809    e.g. `{"A": [1, 2, 3], "B": [4, 5, 6]}`
810    """
811
812    def __init__(
813        self,
814        confident_residues: Dict
815    ):
816        self.confident_residues = confident_residues
817
818    @overload
819    def accept_residue(
820        self,
821        residue: Bio.PDB.Residue.Residue
822    ) -> bool:
823        ...
824
825    @overload
826    def accept_residue(self, residue):
827        """Overload this to reject residues for output."""
828        return 1
829
830    def accept_residue(
831        self,
832        residue: Bio.PDB.Residue.Residue
833    ) -> bool:
834        """Accept the residue if it's in `confident_residues`.
835
836        Arguments:
837
838        - **residue (Bio.PDB.Residue.Residue)**:<br />
839            Biopython residue object.
840
841        Returns:
842
843        - **(bool)**:<br />
844            `True` if the residue is in the `confident_residues`.
845        """
846
847        chain = residue.parent
848        if not isinstance(chain, Chain):
849            raise TypeError(
850                f"Expected a Bio.PDB.Chain.Chain object, got {type(chain)}"
851            )
852        chain_id = chain.id
853
854        return residue.id[1] in self.confident_residues.get(chain_id, [])

Class to select residues in the structure based on the input dictionary.

This is achieved by overriding the Bio.PDB.Select.accept_residue method from Biopython.

ResidueSelect(confident_residues: Dict)
812    def __init__(
813        self,
814        confident_residues: Dict
815    ):
816        self.confident_residues = confident_residues
confident_residues: Dict

Dictionary containing the chain ID as key and a list of residue numbers as value.
e.g. {"A": [1, 2, 3], "B": [4, 5, 6]}

def accept_residue(self, residue: Bio.PDB.Residue.Residue) -> bool:
830    def accept_residue(
831        self,
832        residue: Bio.PDB.Residue.Residue
833    ) -> bool:
834        """Accept the residue if it's in `confident_residues`.
835
836        Arguments:
837
838        - **residue (Bio.PDB.Residue.Residue)**:<br />
839            Biopython residue object.
840
841        Returns:
842
843        - **(bool)**:<br />
844            `True` if the residue is in the `confident_residues`.
845        """
846
847        chain = residue.parent
848        if not isinstance(chain, Chain):
849            raise TypeError(
850                f"Expected a Bio.PDB.Chain.Chain object, got {type(chain)}"
851            )
852        chain_id = chain.id
853
854        return residue.id[1] in self.confident_residues.get(chain_id, [])

Accept the residue if it's in confident_residues.

Arguments:

  • residue (Bio.PDB.Residue.Residue):
    Biopython residue object.

Returns: