structure_tools
- This module provides utility functions and classes to work with structure files (PDB or CIF).
- Uses
Biopythonfor structure manipulation and parsing. - The structure object refers to
Bio.PDB.Structure.Structure. - The residue object refers to
Bio.PDB.Residue.Residue.
Hence, the "residue" term is used for amino acids, nucleotides, ions, ligands.
1""" 2[structure_tools](https://github.com/isblab/af_pipeline/tree/main/af_pipeline/tools/structure_tools.py) 3================================== 4- This module provides utility functions and classes to work with structure files (PDB or CIF). 5- Uses `Biopython` for structure manipulation and parsing. 6- The structure object refers to `Bio.PDB.Structure.Structure`. 7- The residue object refers to `Bio.PDB.Residue.Residue`.<br /> 8 Hence, the "residue" term is used for amino acids, nucleotides, ions, ligands. 9""" 10from textwrap import dedent 11import warnings 12import Bio 13import Bio.PDB 14import Bio.PDB.Structure 15import Bio.PDB.Residue 16import Bio.PDB.Atom 17import numpy as np 18from Bio.PDB.Chain import Chain 19from Bio.PDB.mmcifio import MMCIFIO 20from Bio.PDB.PDBIO import Select, PDBIO 21from collections import defaultdict 22from typing import Dict, Any, overload 23from af_pipeline.constants.af_constants import ( 24 PROTEIN_ENTITIES, 25 DNA_ENTITIES, 26 RNA_ENTITIES, 27 ION, 28 PURINES, 29 PYRIMIDINES, 30 ALLOWED_PTMS, 31 ALLOWED_DNA_MODS, 32 ALLOWED_RNA_MODS, 33 ALLOWED_LIGANDS, 34 ONLY_CA_RESIDUES, 35 AtomDecoration, 36 EntityType, 37 FileFormat, 38 InteractionMapType, 39 ResidueDecoration, 40 ResidueMapDepth, 41 VALID_INTERACTION_MAP_TYPES, 42 ResidueMapKeys, 43 TokenLevel, 44) 45 46def get_interaction_map( 47 coords1: np.ndarray, 48 coords2: np.ndarray, 49 contact_threshold: float, 50 map_type: InteractionMapType, 51) -> np.ndarray: 52 """ Create an interaction map, given the input coordinates. 53 54 Returns a distance map or a contact map, based on the map_type specified. 55 56 ## Arguments: 57 58 - **coords1 (np.ndarray)**:<br /> 59 Coordinates of shape (N, 3) for the first set of atoms/residues. 60 61 - **coords2 (np.ndarray)**:<br /> 62 Coordinates of shape (M, 3) for the second set of atoms/residues. 63 64 - **contact_threshold (float)**:<br /> 65 Distance threshold to define a contact. 66 Only used if `map_type` is `InteractionMapType.CONTACT`. 67 68 - **map_type (InteractionMapType)**:<br /> 69 Type of interaction map to create. 70 Can be either `InteractionMapType.DISTANCE` or `InteractionMapType.CONTACT`. 71 72 - `InteractionMapType.DISTANCE`: 73 Returns a distance map of shape (N, M) 74 where each element (i, j) is the distance between the i-th atom/residue 75 in `coords1` and the j-th atom/residue in `coords2`. 76 77 - `InteractionMapType.CONTACT`: 78 Returns a binary contact map of shape (N, M) 79 where each element (i, j) is 1 if the distance between the i-th atom/residue 80 in `coords1` and the j-th atom/residue in `coords2` is less than or equal to 81 `contact_threshold`, and 0 otherwise. 82 83 ## Returns: 84 85 - **np.ndarray**:<br /> 86 The interaction map as a numpy array. 87 """ 88 89 if map_type not in VALID_INTERACTION_MAP_TYPES: 90 raise ValueError( 91 f"Invalid map_type specified."\ 92 f"Expected one of {VALID_INTERACTION_MAP_TYPES}, got {map_type}" 93 ) 94 95 distance_map = get_distance_map(coords1, coords2) 96 97 if map_type == InteractionMapType.DISTANCE: 98 return distance_map 99 100 elif map_type == InteractionMapType.CONTACT: 101 contact_map = get_contact_map(distance_map, contact_threshold) 102 return contact_map 103 104def get_distance_map(coords1: np.ndarray, coords2: np.ndarray): 105 """ Create an all-v-all distance map. 106 107 Returns a matrix of distances between all pairs of atoms/residues in the 108 two sets of coordinates. 109 110 ## Arguments: 111 112 - **coords1 (np.ndarray)**:<br /> 113 Coordinates of shape (N, 3) for the first set of atoms/residues. 114 115 - **coords2 (np.ndarray)**:<br /> 116 Coordinates of shape (M, 3) for the second set of atoms/residues. 117 118 ## Returns: 119 120 - **np.ndarray**:<br /> 121 A matrix of shape (N, M) where each element (i, j) is the Euclidean distance 122 between the i-th atom/residue in `coords1` and the j-th atom/residue in `coords2`. 123 """ 124 125 from scipy.spatial import distance_matrix 126 127 distance_map = distance_matrix(coords1, coords2) 128 129 return distance_map 130 131def get_contact_map(distance_map: np.ndarray, contact_threshold: float): 132 """ Given the distance map, create a binary contact map by thresholding distances. 133 134 Returns a binary matrix, where 1 indicates a contact and 0 indicates no contact. 135 136 ## Arguments: 137 138 - **distance_map (np.ndarray)**:<br /> 139 A matrix of shape (N, M) where each element (i, j) is the Euclidean distance 140 between the i-th atom/residue in `coords1` and the j-th atom/residue in `coords2`. 141 142 - **contact_threshold (float)**:<br /> 143 Distance threshold to define a contact. If the distance between two atoms/residues 144 is less than or equal to this threshold, they are considered to be in contact. 145 146 ## Returns: 147 148 - **np.ndarray**:<br /> 149 A binary matrix of shape (N, M) where each element (i, j) is 1 if the distance 150 between the i-th atom/residue in `coords1` and the j-th atom/residue in `coords2` 151 is less than or equal to `contact_threshold`, and 0 otherwise. 152 """ 153 154 contact_map = np.where(distance_map <= contact_threshold, 1, 0) 155 156 return contact_map 157 158def get_token_level(residue: Bio.PDB.Residue.Residue) -> TokenLevel: 159 """ Get the token level of the residue. 160 161 Assuming the `residue` object is decorated with the appropriate attributes. 162 - `tokenLevel`: `TokenLevel.ATOM` if the residue has per-atom token, `TokenLevel.RESIDUE` otherwise. 163 164 Arguments: 165 166 - **residue (Bio.PDB.Residue.Residue)**:<br /> 167 Biopython residue object. 168 169 Returns: 170 171 - **token_level (TokenLevel)**:<br /> 172 Token level of the residue. 173 """ 174 175 token_level = residue.xtra.get( 176 ResidueDecoration.TOKEN_LEVEL, TokenLevel.RESIDUE 177 ) 178 179 return token_level 180 181def has_modifications(structure: Bio.PDB.Structure.Structure) -> bool: 182 """ Check if the structure has any modified residues. 183 184 Assuming the `residue` objects are decorated with the appropriate attributes. 185 - `is_modified`: `True` if the `residue` is modified. 186 187 Arguments: 188 189 - **structure (Bio.PDB.Structure.Structure)**:<br /> 190 Biopython structure object. 191 192 Returns: 193 194 - **condition (bool)**:<br /> 195 `True` if the structure has any modified residues, `False` otherwise. 196 """ 197 198 for residue in structure.get_residues(): 199 if residue.xtra.get(ResidueDecoration.IS_MODIFIED, False): 200 return True 201 202 return False 203 204def save_structure_obj( 205 structure: Bio.PDB.Structure.Structure, 206 out_file: str, 207 res_select_obj: Select = Select(), 208 save_type: str = FileFormat.CIF, 209 preserve_header_footer = False, 210): 211 """Save the selection in Biopython structure object as a PDB or CIF file. 212 213 Arguments: 214 215 - **structure (Bio.PDB.Structure.Structure)**:<br /> 216 Biopython structure object to save. 217 218 - **out_file (str)**:<br /> 219 Path to the output file where the structure will be saved. 220 221 - **res_select_obj (Bio.PDB.Select, optional)**:<br /> 222 Biopython Select object to filter the residues to save. 223 Defaults to `Select()` which saves all residues. 224 225 - **save_type (str, optional)**:<br /> 226 Type of file to save the structure as. 227 Can be either "pdb" or "cif". 228 229 - **preserve_header_footer (bool, optional)**:<br /> 230 If `True`, the header and footer information from the structure 231 will be preserved in the saved file. 232 > [!NOTE] 233 > The header and footer information can only be preserved for CIF files. 234 """ 235 236 if save_type == FileFormat.PDB: 237 238 io = PDBIO() 239 io.set_structure(structure) 240 io.save(out_file, res_select_obj) 241 242 if preserve_header_footer: 243 warnings.warn( 244 """ 245 246 PDB files do not support headers and footers. 247 Saving without headers and footers. 248 """ 249 ) 250 251 elif save_type == FileFormat.CIF: 252 253 io = MMCIFIO() 254 io.set_structure(structure) 255 io.save(out_file, res_select_obj) 256 257 if preserve_header_footer: 258 259 header_footer = structure.xtra.get("header_footer", {}) 260 261 if {"header", "footer"}.issubset(set(header_footer.keys())): 262 263 with open(out_file, "r") as f: 264 lines = f.readlines() 265 266 lines.insert(0, "\n") 267 lines.append("\n") 268 269 for header_line in header_footer["header"][::-1]: 270 lines.insert(0, f"{header_line}") 271 272 for footer_line in header_footer["footer"][::-1]: 273 lines.append(f"{footer_line}") 274 275 with open(out_file, "w") as f: 276 for line in lines: 277 f.write(line) 278 279 else: 280 warnings.warn( 281 """ 282 283 No header or footer information found in the structure. 284 Saving without headers and footers. 285 """ 286 ) 287 288def add_header_footer( 289 structure: Bio.PDB.Structure.Structure, 290 structure_file_path: str, 291) -> Bio.PDB.Structure.Structure: 292 """Add the header and footer information to the structure object. 293 294 While parsing the `CIF` file using Biopython `MMCIFParser`, 295 the header and footer information information is not retained.<br /> 296 This function extracts the header and footer information from the 297 structure file and adds it to the structure object. \n 298 299 The information is stored in the `xtra` attribute of the structure 300 object under the key `header_footer`. \n 301 302 > [!NOTE] 303 > - `PDB` format does not have header and footer information. 304 > - Using this function leads to parsing of the structure file twice. Once 305 > to get the structure object and once to extract the header and footer information. 306 307 Arguments: 308 309 - **structure (Bio.PDB.Structure.Structure)**:<br /> 310 Biopython structure object. 311 312 - **structure_file_path (str)**:<br /> 313 Path to the structure file. 314 315 Returns: 316 317 - **structure (Bio.PDB.Structure.Structure)**:<br /> 318 Biopython Structure object with `header_footer` added in `xtra`. 319 """ 320 321 with open(structure_file_path, "r") as f: 322 lines = f.readlines() 323 324 header_info = [] 325 header_section = "" 326 327 for line in lines: 328 header_section += line 329 330 if line.startswith("#"): 331 header_info.append(header_section) 332 header_section = "" 333 334 if line.startswith("_atom_site"): 335 break 336 337 footer_info = [] 338 footer_section = "" 339 340 for line in lines[::-1]: 341 footer_section = line + footer_section 342 343 if line.startswith("#"): 344 footer_info.append(footer_section) 345 footer_section = "" 346 347 if line.startswith("ATOM") or line.startswith("HETATM"): 348 break 349 350 structure.xtra["header_footer"] = { 351 "header": header_info, 352 "footer": footer_info, 353 } 354 355 return structure 356 357def decorate_residue( 358 residue: Bio.PDB.Residue.Residue, 359 xtra_field: str | None = None, 360 xtra_value: Any = None, 361): 362 """Decorate the residue. 363 364 This function adds `entityType` attribute to the residue's `xtra` 365 attribute depending on the type of entity. 366 367 Allowed entity types are: 368 ```python 369 - "proteinChain" 370 - "dnaSequence" 371 - "rnaSequence" 372 - "ligand" 373 - "ion" 374 - None #(if the residue does not belong to any of the above types) 375 ``` 376 377 It also adds the following flags to the residue's `xtra` wherever applicable - 378 ```python 379 - "is_modified" # boolean indicating if the residue is modified 380 - "is_ca_only" # boolean indicating if the residue has only CA atom 381 - "is_purine" # boolean indicating if the nucleotide is purine 382 - "is_pyrimidine" # boolean indicating if the nucleotide is pyrimidine 383 - "is_ion" # boolean indicating if the residue is an ion. 384 - "is_ligand" # boolean indicating if the residue is a ligand. 385 ``` 386 387 Arguments: 388 389 - **residue (Bio.PDB.Residue.Residue)**:<br /> 390 Biopython residue object. 391 """ 392 393 symbol = residue.get_resname() 394 395 if symbol in PROTEIN_ENTITIES: 396 residue.xtra[ResidueDecoration.ENTITY_TYPE] = EntityType.PROTEIN_CHAIN 397 398 if symbol in ALLOWED_PTMS: 399 residue.xtra[ResidueDecoration.IS_MODIFIED] = True 400 401 if symbol in ONLY_CA_RESIDUES: 402 residue.xtra[ResidueDecoration.IS_CA_ONLY] = True 403 404 elif symbol in DNA_ENTITIES: 405 residue.xtra[ResidueDecoration.ENTITY_TYPE] = EntityType.DNA_SEQUENCE 406 407 if symbol in ALLOWED_DNA_MODS: 408 residue.xtra[ResidueDecoration.IS_MODIFIED] = True 409 elif symbol in RNA_ENTITIES: 410 residue.xtra[ResidueDecoration.ENTITY_TYPE] = EntityType.RNA_SEQUENCE 411 412 if symbol in ALLOWED_RNA_MODS: 413 residue.xtra[ResidueDecoration.IS_MODIFIED] = True 414 415 elif symbol in ALLOWED_LIGANDS: 416 residue.xtra[ResidueDecoration.ENTITY_TYPE] = EntityType.LIGAND 417 418 elif symbol in ION: 419 residue.xtra[ResidueDecoration.ENTITY_TYPE] = EntityType.ION 420 421 else: 422 warnings.warn(dedent(f""" 423 The residue "{symbol}" does not belong to any known entity types. 424 Setting 'entityType' to None.""") 425 ) 426 residue.xtra[ResidueDecoration.ENTITY_TYPE] = None 427 428 if symbol in PURINES: 429 residue.xtra[ResidueDecoration.IS_PURINE] = True 430 431 elif symbol in PYRIMIDINES: 432 residue.xtra[ResidueDecoration.IS_PYRIMIDINE] = True 433 434 if xtra_field is not None and xtra_value is not None: 435 if xtra_field in residue.xtra: 436 warnings.warn(dedent(f""" 437 The field '{xtra_field}' already exists in the residue's xtra. 438 Overwriting the value.""") 439 ) 440 residue.xtra[xtra_field] = xtra_value 441 442 condition = ( 443 residue.xtra.get(ResidueDecoration.IS_MODIFIED) 444 or residue.xtra.get(ResidueDecoration.ENTITY_TYPE) in [EntityType.LIGAND, None] 445 ) 446 _get_token_level = {True: TokenLevel.ATOM, False: TokenLevel.RESIDUE} 447 residue.xtra[ResidueDecoration.TOKEN_LEVEL] = _get_token_level[condition] 448 449def decorate_atom( 450 atom: Bio.PDB.Atom.Atom, 451 xtra_field: str | None = None, 452 xtra_value: Any = None, 453): 454 """ Decorate the atom. 455 456 This function adds `is_representative` attribute to the atom's `xtra` 457 458 Arguments: 459 460 - **atom (Bio.PDB.Atom.Atom)**:<br /> 461 Biopython atom object. 462 463 - **xtra_field (str | None, optional)**:<br /> 464 field to add to the atom's `xtra`. 465 466 - **xtra_value (Any, optional)**:<br /> 467 value of the field to add to the atom's `xtra`. 468 """ 469 470 symbol = atom.get_name() 471 residue = atom.get_parent() 472 473 if not isinstance(residue, Bio.PDB.Residue.Residue): 474 raise TypeError( 475 f"Expected a `Bio.PDB.Residue.Residue` object, got {type(residue)}" 476 ) 477 478 residue = residue.get_resname() 479 480 residue_attrs = ( 481 residue in PROTEIN_ENTITIES, 482 residue in ONLY_CA_RESIDUES, 483 residue in PURINES, 484 residue in PYRIMIDINES, 485 symbol 486 ) 487 488 atom_xtra_dict = { 489 (True, False, False, False, "CB"): True, 490 (True, True, False, False, "CA"): True, 491 (False, False, True, False, "C4"): True, 492 (False, False, False, True, "C2"): True, 493 } 494 495 atom.xtra[AtomDecoration.IS_REPRESENTATIVE] = atom_xtra_dict.get( 496 residue_attrs, False 497 ) 498 499 if ( 500 xtra_field is not None 501 and xtra_value is not None 502 and xtra_field in atom.xtra 503 ): 504 warnings.warn(dedent(f""" 505 The field '{xtra_field}' already exists in the atom's xtra. 506 Overwriting the value.""") 507 ) 508 atom.xtra[xtra_field] = xtra_value 509 510 511class RenumberResidues: 512 """Class to renumber the residues based on the offset. 513 514 If the input sequence to the AlphFold does not start from the first residue, 515 the residue numbering in the predicted structure will be different from 516 the original (UniProt in case of proteins) numbering. 517 518 This class renumbers the residues in the structure based on the provided `offset`. 519 520 > [!NOTE] 521 > Although written with the purpose for AlphFold predicted structures, 522 > this class can be used to renumber residues in any structure file. 523 """ 524 525 offset: dict 526 """ Offset describing start and end residue number for each chain in 527 the predicted structure.\n 528 example: `{'A': [1, 100], 'B': [101, 200]}`.""" 529 530 def __init__(self, offset: dict = {}): 531 532 self.offset = offset 533 534 def renumber_structure( 535 self, 536 structure: Bio.PDB.Structure.Structure, 537 ): 538 """Renumber the residues in the structure based on the offset. 539 540 Arguments: 541 542 - **structure (Bio.PDB.Structure.Structure)**:<br /> 543 Biopython structure object. 544 545 Returns: 546 547 - **structure (Bio.PDB.Structure.Structure)**:<br /> 548 Biopython structure object with renumbered residues. 549 """ 550 551 for model in structure: 552 for residue in model.get_residues(): 553 chain_id = residue.parent.id 554 h, num, ins = residue.id 555 556 num = self.renumber_chain_res_num( 557 chain_res_num=num, 558 chain_id=chain_id, 559 ) 560 561 residue.id = (h, num, ins) 562 563 return structure 564 565 def original_chain_res_num( 566 self, 567 chain_res_num: int, 568 chain_id: str, 569 ): 570 """Get the original residue number based on the offset. 571 572 Inverse of `renumber_chain_res_num`. 573 574 Arguments: 575 576 - **chain_res_num (int)**:<br /> 577 Residue index (1-indexed) within the chain in the predicted structure. 578 579 - **chain_id (str)**:<br /> 580 Chain ID of the residue. 581 582 Returns: 583 584 - **chain_res_num (int)**:<br /> 585 Original residue number 586 """ 587 588 if chain_id in self.offset: 589 chain_res_num -= (self.offset[chain_id][0] - 1) 590 591 return chain_res_num 592 593 def renumber_chain_res_num( 594 self, 595 chain_res_num: int, 596 chain_id: str, 597 ): 598 """Renumber the residue number based on the offset. 599 600 Given a residue index (1-indexed) in a chain, this function renumbers it 601 based on the offset provided for that chain. 602 603 Arguments: 604 605 - **chain_res_num (int)**:<br /> 606 Residue index (1-indexed) within the chain in the predicted structure. 607 608 - **chain_id (str)**:<br /> 609 Chain ID of the residue. 610 611 Returns: 612 613 - **chain_res_num (int)**:<br /> 614 Renumbered residue number 615 """ 616 617 if chain_id in self.offset: 618 chain_res_num += self.offset[chain_id][0] - 1 619 620 return chain_res_num 621 622 def renumber_region_of_interest( 623 self, 624 region_of_interest: Dict[str, list], 625 ): 626 """Offset the region of interest to the AF2/3 numbering. 627 628 Region of interest is defined by the user is as per the original numbering 629 (UniProt in case of proteins). \n 630 However, if the prediction is done on a fragment of the protein, the 631 residue numbering in the predicted structure will be different compared to 632 the one provided by the user. \n 633 This function renumbers the region of interest to the numbering of the 634 predicted structure based on the `offset`. \n 635 636 Arguments: 637 638 - **region_of_interest (dict)**: 639 Dictionary containing the region of interest for each chain. 640 641 Returns: 642 643 - **renumbered_region_of_interest (dict)**: 644 Dictionary containing the renumbered region of interest for 645 each chain. 646 """ 647 648 renumbered_region_of_interest = {} 649 650 for chain_id in region_of_interest: 651 652 start, end = region_of_interest[chain_id] 653 654 if chain_id in self.offset: 655 656 start = start - (self.offset[chain_id][0] - 1) 657 end = end - (self.offset[chain_id][0] - 1) 658 659 renumbered_region_of_interest[chain_id] = [start, end] 660 661 return renumbered_region_of_interest 662 663 def residue_map( 664 self, 665 token_chain_ids: list, 666 token_res_ids: list, 667 token_atom_names: list, 668 depth: ResidueMapDepth = ResidueMapDepth.ATOM, 669 ): 670 """Create a map of residue indices to residue numbers and vice-versa. 671 672 `res_idx` is essentially the token index. \n 673 `res_num` is the residue number in the chain which will be same as UniProt 674 numbering if the `offset` is provided correctly or full-length sequences 675 were used in the prediction. \n 676 `offset` informs what is the starting residue number for each chain. 677 678 For example, if the input is as follows: \n 679 ```python 680 token_chain_ids = ['A', 'A', 'A', 'B', 'B'] 681 token_res_ids = [1, 1, 2, 1, 2] 682 token_atom_names = ['N', 'CA', 'N', 'N', 'CA'] 683 ``` 684 The output will be: \n 685 ```python 686 idx_to_num = { 687 0: {'chain_id': 'A', 'res_num': 1, 'atom_name': 'N'}, 688 1: {'chain_id': 'A', 'res_num': 1, 'atom_name': 'CA'}, 689 2: {'chain_id': 'A', 'res_num': 2, 'atom_name': 'N'}, 690 3: {'chain_id': 'B', 'res_num': 1, 'atom_name': 'N'}, 691 4: {'chain_id': 'B', 'res_num': 2, 'atom_name': 'CA'}, 692 } 693 num_to_idx = { 694 'A': { 695 1: {'N': 0, 'CA': 1}, 696 2: {'N': 2}, 697 }, 698 'B': { 699 1: {'N': 3}, 700 2: {'CA': 4}, 701 }, 702 } 703 ``` 704 705 If the offset is provided as: \n 706 ```python 707 offset = { 708 'A': [10, 11], 709 'B': [30, 31], 710 } 711 ``` 712 The output will be: \n 713 ```python 714 idx_to_num = { 715 0: {'chain_id': 'A', 'res_num': 10, 'atom_name': 'N'}, 716 1: {'chain_id': 'A', 'res_num': 10, 'atom_name': 'CA'}, 717 2: {'chain_id': 'A', 'res_num': 11, 'atom_name': 'N'}, 718 3: {'chain_id': 'B', 'res_num': 30, 'atom_name': 'N'}, 719 4: {'chain_id': 'B', 'res_num': 31, 'atom_name': 'CA'}, 720 } 721 num_to_idx = { 722 'A': { 723 10: {'N': 0, 'CA': 1}, 724 11: {'N': 2}, 725 }, 726 'B': { 727 30: {'N': 3}, 728 31: {'CA': 4}, 729 }, 730 } 731 ``` 732 733 Arguments: 734 735 - **token_chain_ids (list)**:<br /> 736 Token chain IDs. Same as provided in the AF3 JSON output. 737 738 - **token_res_ids (list)**:<br /> 739 Token residue IDs. Same as provided in the AF3 JSON output. 740 741 - **token_atom_names (list)**:<br /> 742 Token atom names. 743 744 Returns: 745 746 - **(tuple)**:<br /> 747 748 - `idx_to_num (Dict)`: 749 Dictionary mapping token indices to token numbers. 750 751 - `num_to_idx (Dict)`: 752 Dictionary mapping token numbers to token indices. 753 """ 754 755 idx_to_num = {} 756 num_to_idx = defaultdict(dict) 757 758 for token_idx, (chain_id, token_num, atom_name) in enumerate( 759 zip(token_chain_ids, token_res_ids, token_atom_names) 760 ): 761 762 token_num = self.renumber_chain_res_num( 763 chain_res_num=token_num, 764 chain_id=chain_id, 765 ) 766 767 idx_to_num[token_idx] = { 768 ResidueMapKeys.CHAIN_ID: chain_id, 769 ResidueMapKeys.TOKEN_NUM: token_num, 770 } 771 772 if depth == ResidueMapDepth.ATOM: 773 idx_to_num[token_idx][ResidueMapKeys.ATOM_NAME] = atom_name 774 775 if token_num not in num_to_idx[chain_id]: 776 if depth == ResidueMapDepth.ATOM: 777 num_to_idx[chain_id][token_num] = { 778 atom_name: token_idx 779 } 780 elif depth == ResidueMapDepth.RESIDUE: 781 num_to_idx[chain_id][token_num] = token_idx 782 else: 783 if depth == ResidueMapDepth.ATOM: 784 num_to_idx[chain_id][token_num][atom_name] = token_idx 785 elif depth == ResidueMapDepth.RESIDUE: 786 warnings.warn( 787 f""" 788 789 Multiple tokens found for residue number {token_num} 790 in chain {chain_id}. Overwriting the previous token index. 791 """ 792 ) 793 num_to_idx[chain_id][token_num] = token_idx 794 795 return idx_to_num, num_to_idx 796 797 798class ResidueSelect(Select): 799 """Class to select residues in the structure based on the input dictionary. 800 801 This is achieved by overriding the `Bio.PDB.Select.accept_residue` method 802 from Biopython. 803 """ 804 805 confident_residues: Dict 806 """ Dictionary containing the chain ID as key and a list of residue 807 numbers as value.<br /> 808 e.g. `{"A": [1, 2, 3], "B": [4, 5, 6]}` 809 """ 810 811 def __init__( 812 self, 813 confident_residues: Dict 814 ): 815 self.confident_residues = confident_residues 816 817 @overload 818 def accept_residue( 819 self, 820 residue: Bio.PDB.Residue.Residue 821 ) -> bool: 822 ... 823 824 @overload 825 def accept_residue(self, residue): 826 """Overload this to reject residues for output.""" 827 return 1 828 829 def accept_residue( 830 self, 831 residue: Bio.PDB.Residue.Residue 832 ) -> bool: 833 """Accept the residue if it's in `confident_residues`. 834 835 Arguments: 836 837 - **residue (Bio.PDB.Residue.Residue)**:<br /> 838 Biopython residue object. 839 840 Returns: 841 842 - **(bool)**:<br /> 843 `True` if the residue is in the `confident_residues`. 844 """ 845 846 chain = residue.parent 847 if not isinstance(chain, Chain): 848 raise TypeError( 849 f"Expected a Bio.PDB.Chain.Chain object, got {type(chain)}" 850 ) 851 chain_id = chain.id 852 853 return residue.id[1] in self.confident_residues.get(chain_id, [])
47def get_interaction_map( 48 coords1: np.ndarray, 49 coords2: np.ndarray, 50 contact_threshold: float, 51 map_type: InteractionMapType, 52) -> np.ndarray: 53 """ Create an interaction map, given the input coordinates. 54 55 Returns a distance map or a contact map, based on the map_type specified. 56 57 ## Arguments: 58 59 - **coords1 (np.ndarray)**:<br /> 60 Coordinates of shape (N, 3) for the first set of atoms/residues. 61 62 - **coords2 (np.ndarray)**:<br /> 63 Coordinates of shape (M, 3) for the second set of atoms/residues. 64 65 - **contact_threshold (float)**:<br /> 66 Distance threshold to define a contact. 67 Only used if `map_type` is `InteractionMapType.CONTACT`. 68 69 - **map_type (InteractionMapType)**:<br /> 70 Type of interaction map to create. 71 Can be either `InteractionMapType.DISTANCE` or `InteractionMapType.CONTACT`. 72 73 - `InteractionMapType.DISTANCE`: 74 Returns a distance map of shape (N, M) 75 where each element (i, j) is the distance between the i-th atom/residue 76 in `coords1` and the j-th atom/residue in `coords2`. 77 78 - `InteractionMapType.CONTACT`: 79 Returns a binary contact map of shape (N, M) 80 where each element (i, j) is 1 if the distance between the i-th atom/residue 81 in `coords1` and the j-th atom/residue in `coords2` is less than or equal to 82 `contact_threshold`, and 0 otherwise. 83 84 ## Returns: 85 86 - **np.ndarray**:<br /> 87 The interaction map as a numpy array. 88 """ 89 90 if map_type not in VALID_INTERACTION_MAP_TYPES: 91 raise ValueError( 92 f"Invalid map_type specified."\ 93 f"Expected one of {VALID_INTERACTION_MAP_TYPES}, got {map_type}" 94 ) 95 96 distance_map = get_distance_map(coords1, coords2) 97 98 if map_type == InteractionMapType.DISTANCE: 99 return distance_map 100 101 elif map_type == InteractionMapType.CONTACT: 102 contact_map = get_contact_map(distance_map, contact_threshold) 103 return contact_map
Create an interaction map, given the input coordinates.
Returns a distance map or a contact map, based on the map_type specified.
Arguments:
coords1 (np.ndarray):
Coordinates of shape (N, 3) for the first set of atoms/residues.coords2 (np.ndarray):
Coordinates of shape (M, 3) for the second set of atoms/residues.contact_threshold (float):
Distance threshold to define a contact. Only used ifmap_typeisInteractionMapType.CONTACT.map_type (InteractionMapType):
Type of interaction map to create. Can be eitherInteractionMapType.DISTANCEorInteractionMapType.CONTACT.InteractionMapType.DISTANCE: Returns a distance map of shape (N, M) where each element (i, j) is the distance between the i-th atom/residue incoords1and the j-th atom/residue incoords2.InteractionMapType.CONTACT: Returns a binary contact map of shape (N, M) where each element (i, j) is 1 if the distance between the i-th atom/residue incoords1and the j-th atom/residue incoords2is less than or equal tocontact_threshold, and 0 otherwise.
Returns:
- np.ndarray:
The interaction map as a numpy array.
105def get_distance_map(coords1: np.ndarray, coords2: np.ndarray): 106 """ Create an all-v-all distance map. 107 108 Returns a matrix of distances between all pairs of atoms/residues in the 109 two sets of coordinates. 110 111 ## Arguments: 112 113 - **coords1 (np.ndarray)**:<br /> 114 Coordinates of shape (N, 3) for the first set of atoms/residues. 115 116 - **coords2 (np.ndarray)**:<br /> 117 Coordinates of shape (M, 3) for the second set of atoms/residues. 118 119 ## Returns: 120 121 - **np.ndarray**:<br /> 122 A matrix of shape (N, M) where each element (i, j) is the Euclidean distance 123 between the i-th atom/residue in `coords1` and the j-th atom/residue in `coords2`. 124 """ 125 126 from scipy.spatial import distance_matrix 127 128 distance_map = distance_matrix(coords1, coords2) 129 130 return distance_map
Create an all-v-all distance map.
Returns a matrix of distances between all pairs of atoms/residues in the two sets of coordinates.
Arguments:
coords1 (np.ndarray):
Coordinates of shape (N, 3) for the first set of atoms/residues.coords2 (np.ndarray):
Coordinates of shape (M, 3) for the second set of atoms/residues.
Returns:
- np.ndarray:
A matrix of shape (N, M) where each element (i, j) is the Euclidean distance between the i-th atom/residue incoords1and the j-th atom/residue incoords2.
132def get_contact_map(distance_map: np.ndarray, contact_threshold: float): 133 """ Given the distance map, create a binary contact map by thresholding distances. 134 135 Returns a binary matrix, where 1 indicates a contact and 0 indicates no contact. 136 137 ## Arguments: 138 139 - **distance_map (np.ndarray)**:<br /> 140 A matrix of shape (N, M) where each element (i, j) is the Euclidean distance 141 between the i-th atom/residue in `coords1` and the j-th atom/residue in `coords2`. 142 143 - **contact_threshold (float)**:<br /> 144 Distance threshold to define a contact. If the distance between two atoms/residues 145 is less than or equal to this threshold, they are considered to be in contact. 146 147 ## Returns: 148 149 - **np.ndarray**:<br /> 150 A binary matrix of shape (N, M) where each element (i, j) is 1 if the distance 151 between the i-th atom/residue in `coords1` and the j-th atom/residue in `coords2` 152 is less than or equal to `contact_threshold`, and 0 otherwise. 153 """ 154 155 contact_map = np.where(distance_map <= contact_threshold, 1, 0) 156 157 return contact_map
Given the distance map, create a binary contact map by thresholding distances.
Returns a binary matrix, where 1 indicates a contact and 0 indicates no contact.
Arguments:
distance_map (np.ndarray):
A matrix of shape (N, M) where each element (i, j) is the Euclidean distance between the i-th atom/residue incoords1and the j-th atom/residue incoords2.contact_threshold (float):
Distance threshold to define a contact. If the distance between two atoms/residues is less than or equal to this threshold, they are considered to be in contact.
Returns:
- np.ndarray:
A binary matrix of shape (N, M) where each element (i, j) is 1 if the distance between the i-th atom/residue incoords1and the j-th atom/residue incoords2is less than or equal tocontact_threshold, and 0 otherwise.
159def get_token_level(residue: Bio.PDB.Residue.Residue) -> TokenLevel: 160 """ Get the token level of the residue. 161 162 Assuming the `residue` object is decorated with the appropriate attributes. 163 - `tokenLevel`: `TokenLevel.ATOM` if the residue has per-atom token, `TokenLevel.RESIDUE` otherwise. 164 165 Arguments: 166 167 - **residue (Bio.PDB.Residue.Residue)**:<br /> 168 Biopython residue object. 169 170 Returns: 171 172 - **token_level (TokenLevel)**:<br /> 173 Token level of the residue. 174 """ 175 176 token_level = residue.xtra.get( 177 ResidueDecoration.TOKEN_LEVEL, TokenLevel.RESIDUE 178 ) 179 180 return token_level
Get the token level of the residue.
Assuming the residue object is decorated with the appropriate attributes.
tokenLevel:TokenLevel.ATOMif the residue has per-atom token,TokenLevel.RESIDUEotherwise.
Arguments:
- residue (Bio.PDB.Residue.Residue):
Biopython residue object.
Returns:
- token_level (TokenLevel):
Token level of the residue.
182def has_modifications(structure: Bio.PDB.Structure.Structure) -> bool: 183 """ Check if the structure has any modified residues. 184 185 Assuming the `residue` objects are decorated with the appropriate attributes. 186 - `is_modified`: `True` if the `residue` is modified. 187 188 Arguments: 189 190 - **structure (Bio.PDB.Structure.Structure)**:<br /> 191 Biopython structure object. 192 193 Returns: 194 195 - **condition (bool)**:<br /> 196 `True` if the structure has any modified residues, `False` otherwise. 197 """ 198 199 for residue in structure.get_residues(): 200 if residue.xtra.get(ResidueDecoration.IS_MODIFIED, False): 201 return True 202 203 return False
Check if the structure has any modified residues.
Assuming the residue objects are decorated with the appropriate attributes.
is_modified:Trueif theresidueis modified.
Arguments:
- structure (Bio.PDB.Structure.Structure):
Biopython structure object.
Returns:
- condition (bool):
Trueif the structure has any modified residues,Falseotherwise.
205def save_structure_obj( 206 structure: Bio.PDB.Structure.Structure, 207 out_file: str, 208 res_select_obj: Select = Select(), 209 save_type: str = FileFormat.CIF, 210 preserve_header_footer = False, 211): 212 """Save the selection in Biopython structure object as a PDB or CIF file. 213 214 Arguments: 215 216 - **structure (Bio.PDB.Structure.Structure)**:<br /> 217 Biopython structure object to save. 218 219 - **out_file (str)**:<br /> 220 Path to the output file where the structure will be saved. 221 222 - **res_select_obj (Bio.PDB.Select, optional)**:<br /> 223 Biopython Select object to filter the residues to save. 224 Defaults to `Select()` which saves all residues. 225 226 - **save_type (str, optional)**:<br /> 227 Type of file to save the structure as. 228 Can be either "pdb" or "cif". 229 230 - **preserve_header_footer (bool, optional)**:<br /> 231 If `True`, the header and footer information from the structure 232 will be preserved in the saved file. 233 > [!NOTE] 234 > The header and footer information can only be preserved for CIF files. 235 """ 236 237 if save_type == FileFormat.PDB: 238 239 io = PDBIO() 240 io.set_structure(structure) 241 io.save(out_file, res_select_obj) 242 243 if preserve_header_footer: 244 warnings.warn( 245 """ 246 247 PDB files do not support headers and footers. 248 Saving without headers and footers. 249 """ 250 ) 251 252 elif save_type == FileFormat.CIF: 253 254 io = MMCIFIO() 255 io.set_structure(structure) 256 io.save(out_file, res_select_obj) 257 258 if preserve_header_footer: 259 260 header_footer = structure.xtra.get("header_footer", {}) 261 262 if {"header", "footer"}.issubset(set(header_footer.keys())): 263 264 with open(out_file, "r") as f: 265 lines = f.readlines() 266 267 lines.insert(0, "\n") 268 lines.append("\n") 269 270 for header_line in header_footer["header"][::-1]: 271 lines.insert(0, f"{header_line}") 272 273 for footer_line in header_footer["footer"][::-1]: 274 lines.append(f"{footer_line}") 275 276 with open(out_file, "w") as f: 277 for line in lines: 278 f.write(line) 279 280 else: 281 warnings.warn( 282 """ 283 284 No header or footer information found in the structure. 285 Saving without headers and footers. 286 """ 287 )
Save the selection in Biopython structure object as a PDB or CIF file.
Arguments:
structure (Bio.PDB.Structure.Structure):
Biopython structure object to save.out_file (str):
Path to the output file where the structure will be saved.res_select_obj (Bio.PDB.Select, optional):
Biopython Select object to filter the residues to save. Defaults toSelect()which saves all residues.save_type (str, optional):
Type of file to save the structure as. Can be either "pdb" or "cif".preserve_header_footer (bool, optional):
IfTrue, the header and footer information from the structure will be preserved in the saved file.NoteThe header and footer information can only be preserved for CIF files.
358def decorate_residue( 359 residue: Bio.PDB.Residue.Residue, 360 xtra_field: str | None = None, 361 xtra_value: Any = None, 362): 363 """Decorate the residue. 364 365 This function adds `entityType` attribute to the residue's `xtra` 366 attribute depending on the type of entity. 367 368 Allowed entity types are: 369 ```python 370 - "proteinChain" 371 - "dnaSequence" 372 - "rnaSequence" 373 - "ligand" 374 - "ion" 375 - None #(if the residue does not belong to any of the above types) 376 ``` 377 378 It also adds the following flags to the residue's `xtra` wherever applicable - 379 ```python 380 - "is_modified" # boolean indicating if the residue is modified 381 - "is_ca_only" # boolean indicating if the residue has only CA atom 382 - "is_purine" # boolean indicating if the nucleotide is purine 383 - "is_pyrimidine" # boolean indicating if the nucleotide is pyrimidine 384 - "is_ion" # boolean indicating if the residue is an ion. 385 - "is_ligand" # boolean indicating if the residue is a ligand. 386 ``` 387 388 Arguments: 389 390 - **residue (Bio.PDB.Residue.Residue)**:<br /> 391 Biopython residue object. 392 """ 393 394 symbol = residue.get_resname() 395 396 if symbol in PROTEIN_ENTITIES: 397 residue.xtra[ResidueDecoration.ENTITY_TYPE] = EntityType.PROTEIN_CHAIN 398 399 if symbol in ALLOWED_PTMS: 400 residue.xtra[ResidueDecoration.IS_MODIFIED] = True 401 402 if symbol in ONLY_CA_RESIDUES: 403 residue.xtra[ResidueDecoration.IS_CA_ONLY] = True 404 405 elif symbol in DNA_ENTITIES: 406 residue.xtra[ResidueDecoration.ENTITY_TYPE] = EntityType.DNA_SEQUENCE 407 408 if symbol in ALLOWED_DNA_MODS: 409 residue.xtra[ResidueDecoration.IS_MODIFIED] = True 410 elif symbol in RNA_ENTITIES: 411 residue.xtra[ResidueDecoration.ENTITY_TYPE] = EntityType.RNA_SEQUENCE 412 413 if symbol in ALLOWED_RNA_MODS: 414 residue.xtra[ResidueDecoration.IS_MODIFIED] = True 415 416 elif symbol in ALLOWED_LIGANDS: 417 residue.xtra[ResidueDecoration.ENTITY_TYPE] = EntityType.LIGAND 418 419 elif symbol in ION: 420 residue.xtra[ResidueDecoration.ENTITY_TYPE] = EntityType.ION 421 422 else: 423 warnings.warn(dedent(f""" 424 The residue "{symbol}" does not belong to any known entity types. 425 Setting 'entityType' to None.""") 426 ) 427 residue.xtra[ResidueDecoration.ENTITY_TYPE] = None 428 429 if symbol in PURINES: 430 residue.xtra[ResidueDecoration.IS_PURINE] = True 431 432 elif symbol in PYRIMIDINES: 433 residue.xtra[ResidueDecoration.IS_PYRIMIDINE] = True 434 435 if xtra_field is not None and xtra_value is not None: 436 if xtra_field in residue.xtra: 437 warnings.warn(dedent(f""" 438 The field '{xtra_field}' already exists in the residue's xtra. 439 Overwriting the value.""") 440 ) 441 residue.xtra[xtra_field] = xtra_value 442 443 condition = ( 444 residue.xtra.get(ResidueDecoration.IS_MODIFIED) 445 or residue.xtra.get(ResidueDecoration.ENTITY_TYPE) in [EntityType.LIGAND, None] 446 ) 447 _get_token_level = {True: TokenLevel.ATOM, False: TokenLevel.RESIDUE} 448 residue.xtra[ResidueDecoration.TOKEN_LEVEL] = _get_token_level[condition]
Decorate the residue.
This function adds entityType attribute to the residue's xtra
attribute depending on the type of entity.
Allowed entity types are:
- "proteinChain"
- "dnaSequence"
- "rnaSequence"
- "ligand"
- "ion"
- None #(if the residue does not belong to any of the above types)
It also adds the following flags to the residue's xtra wherever applicable -
- "is_modified" # boolean indicating if the residue is modified
- "is_ca_only" # boolean indicating if the residue has only CA atom
- "is_purine" # boolean indicating if the nucleotide is purine
- "is_pyrimidine" # boolean indicating if the nucleotide is pyrimidine
- "is_ion" # boolean indicating if the residue is an ion.
- "is_ligand" # boolean indicating if the residue is a ligand.
Arguments:
- residue (Bio.PDB.Residue.Residue):
Biopython residue object.
450def decorate_atom( 451 atom: Bio.PDB.Atom.Atom, 452 xtra_field: str | None = None, 453 xtra_value: Any = None, 454): 455 """ Decorate the atom. 456 457 This function adds `is_representative` attribute to the atom's `xtra` 458 459 Arguments: 460 461 - **atom (Bio.PDB.Atom.Atom)**:<br /> 462 Biopython atom object. 463 464 - **xtra_field (str | None, optional)**:<br /> 465 field to add to the atom's `xtra`. 466 467 - **xtra_value (Any, optional)**:<br /> 468 value of the field to add to the atom's `xtra`. 469 """ 470 471 symbol = atom.get_name() 472 residue = atom.get_parent() 473 474 if not isinstance(residue, Bio.PDB.Residue.Residue): 475 raise TypeError( 476 f"Expected a `Bio.PDB.Residue.Residue` object, got {type(residue)}" 477 ) 478 479 residue = residue.get_resname() 480 481 residue_attrs = ( 482 residue in PROTEIN_ENTITIES, 483 residue in ONLY_CA_RESIDUES, 484 residue in PURINES, 485 residue in PYRIMIDINES, 486 symbol 487 ) 488 489 atom_xtra_dict = { 490 (True, False, False, False, "CB"): True, 491 (True, True, False, False, "CA"): True, 492 (False, False, True, False, "C4"): True, 493 (False, False, False, True, "C2"): True, 494 } 495 496 atom.xtra[AtomDecoration.IS_REPRESENTATIVE] = atom_xtra_dict.get( 497 residue_attrs, False 498 ) 499 500 if ( 501 xtra_field is not None 502 and xtra_value is not None 503 and xtra_field in atom.xtra 504 ): 505 warnings.warn(dedent(f""" 506 The field '{xtra_field}' already exists in the atom's xtra. 507 Overwriting the value.""") 508 ) 509 atom.xtra[xtra_field] = xtra_value
Decorate the atom.
This function adds is_representative attribute to the atom's xtra
Arguments:
atom (Bio.PDB.Atom.Atom):
Biopython atom object.xtra_field (str | None, optional):
field to add to the atom'sxtra.xtra_value (Any, optional):
value of the field to add to the atom'sxtra.
512class RenumberResidues: 513 """Class to renumber the residues based on the offset. 514 515 If the input sequence to the AlphFold does not start from the first residue, 516 the residue numbering in the predicted structure will be different from 517 the original (UniProt in case of proteins) numbering. 518 519 This class renumbers the residues in the structure based on the provided `offset`. 520 521 > [!NOTE] 522 > Although written with the purpose for AlphFold predicted structures, 523 > this class can be used to renumber residues in any structure file. 524 """ 525 526 offset: dict 527 """ Offset describing start and end residue number for each chain in 528 the predicted structure.\n 529 example: `{'A': [1, 100], 'B': [101, 200]}`.""" 530 531 def __init__(self, offset: dict = {}): 532 533 self.offset = offset 534 535 def renumber_structure( 536 self, 537 structure: Bio.PDB.Structure.Structure, 538 ): 539 """Renumber the residues in the structure based on the offset. 540 541 Arguments: 542 543 - **structure (Bio.PDB.Structure.Structure)**:<br /> 544 Biopython structure object. 545 546 Returns: 547 548 - **structure (Bio.PDB.Structure.Structure)**:<br /> 549 Biopython structure object with renumbered residues. 550 """ 551 552 for model in structure: 553 for residue in model.get_residues(): 554 chain_id = residue.parent.id 555 h, num, ins = residue.id 556 557 num = self.renumber_chain_res_num( 558 chain_res_num=num, 559 chain_id=chain_id, 560 ) 561 562 residue.id = (h, num, ins) 563 564 return structure 565 566 def original_chain_res_num( 567 self, 568 chain_res_num: int, 569 chain_id: str, 570 ): 571 """Get the original residue number based on the offset. 572 573 Inverse of `renumber_chain_res_num`. 574 575 Arguments: 576 577 - **chain_res_num (int)**:<br /> 578 Residue index (1-indexed) within the chain in the predicted structure. 579 580 - **chain_id (str)**:<br /> 581 Chain ID of the residue. 582 583 Returns: 584 585 - **chain_res_num (int)**:<br /> 586 Original residue number 587 """ 588 589 if chain_id in self.offset: 590 chain_res_num -= (self.offset[chain_id][0] - 1) 591 592 return chain_res_num 593 594 def renumber_chain_res_num( 595 self, 596 chain_res_num: int, 597 chain_id: str, 598 ): 599 """Renumber the residue number based on the offset. 600 601 Given a residue index (1-indexed) in a chain, this function renumbers it 602 based on the offset provided for that chain. 603 604 Arguments: 605 606 - **chain_res_num (int)**:<br /> 607 Residue index (1-indexed) within the chain in the predicted structure. 608 609 - **chain_id (str)**:<br /> 610 Chain ID of the residue. 611 612 Returns: 613 614 - **chain_res_num (int)**:<br /> 615 Renumbered residue number 616 """ 617 618 if chain_id in self.offset: 619 chain_res_num += self.offset[chain_id][0] - 1 620 621 return chain_res_num 622 623 def renumber_region_of_interest( 624 self, 625 region_of_interest: Dict[str, list], 626 ): 627 """Offset the region of interest to the AF2/3 numbering. 628 629 Region of interest is defined by the user is as per the original numbering 630 (UniProt in case of proteins). \n 631 However, if the prediction is done on a fragment of the protein, the 632 residue numbering in the predicted structure will be different compared to 633 the one provided by the user. \n 634 This function renumbers the region of interest to the numbering of the 635 predicted structure based on the `offset`. \n 636 637 Arguments: 638 639 - **region_of_interest (dict)**: 640 Dictionary containing the region of interest for each chain. 641 642 Returns: 643 644 - **renumbered_region_of_interest (dict)**: 645 Dictionary containing the renumbered region of interest for 646 each chain. 647 """ 648 649 renumbered_region_of_interest = {} 650 651 for chain_id in region_of_interest: 652 653 start, end = region_of_interest[chain_id] 654 655 if chain_id in self.offset: 656 657 start = start - (self.offset[chain_id][0] - 1) 658 end = end - (self.offset[chain_id][0] - 1) 659 660 renumbered_region_of_interest[chain_id] = [start, end] 661 662 return renumbered_region_of_interest 663 664 def residue_map( 665 self, 666 token_chain_ids: list, 667 token_res_ids: list, 668 token_atom_names: list, 669 depth: ResidueMapDepth = ResidueMapDepth.ATOM, 670 ): 671 """Create a map of residue indices to residue numbers and vice-versa. 672 673 `res_idx` is essentially the token index. \n 674 `res_num` is the residue number in the chain which will be same as UniProt 675 numbering if the `offset` is provided correctly or full-length sequences 676 were used in the prediction. \n 677 `offset` informs what is the starting residue number for each chain. 678 679 For example, if the input is as follows: \n 680 ```python 681 token_chain_ids = ['A', 'A', 'A', 'B', 'B'] 682 token_res_ids = [1, 1, 2, 1, 2] 683 token_atom_names = ['N', 'CA', 'N', 'N', 'CA'] 684 ``` 685 The output will be: \n 686 ```python 687 idx_to_num = { 688 0: {'chain_id': 'A', 'res_num': 1, 'atom_name': 'N'}, 689 1: {'chain_id': 'A', 'res_num': 1, 'atom_name': 'CA'}, 690 2: {'chain_id': 'A', 'res_num': 2, 'atom_name': 'N'}, 691 3: {'chain_id': 'B', 'res_num': 1, 'atom_name': 'N'}, 692 4: {'chain_id': 'B', 'res_num': 2, 'atom_name': 'CA'}, 693 } 694 num_to_idx = { 695 'A': { 696 1: {'N': 0, 'CA': 1}, 697 2: {'N': 2}, 698 }, 699 'B': { 700 1: {'N': 3}, 701 2: {'CA': 4}, 702 }, 703 } 704 ``` 705 706 If the offset is provided as: \n 707 ```python 708 offset = { 709 'A': [10, 11], 710 'B': [30, 31], 711 } 712 ``` 713 The output will be: \n 714 ```python 715 idx_to_num = { 716 0: {'chain_id': 'A', 'res_num': 10, 'atom_name': 'N'}, 717 1: {'chain_id': 'A', 'res_num': 10, 'atom_name': 'CA'}, 718 2: {'chain_id': 'A', 'res_num': 11, 'atom_name': 'N'}, 719 3: {'chain_id': 'B', 'res_num': 30, 'atom_name': 'N'}, 720 4: {'chain_id': 'B', 'res_num': 31, 'atom_name': 'CA'}, 721 } 722 num_to_idx = { 723 'A': { 724 10: {'N': 0, 'CA': 1}, 725 11: {'N': 2}, 726 }, 727 'B': { 728 30: {'N': 3}, 729 31: {'CA': 4}, 730 }, 731 } 732 ``` 733 734 Arguments: 735 736 - **token_chain_ids (list)**:<br /> 737 Token chain IDs. Same as provided in the AF3 JSON output. 738 739 - **token_res_ids (list)**:<br /> 740 Token residue IDs. Same as provided in the AF3 JSON output. 741 742 - **token_atom_names (list)**:<br /> 743 Token atom names. 744 745 Returns: 746 747 - **(tuple)**:<br /> 748 749 - `idx_to_num (Dict)`: 750 Dictionary mapping token indices to token numbers. 751 752 - `num_to_idx (Dict)`: 753 Dictionary mapping token numbers to token indices. 754 """ 755 756 idx_to_num = {} 757 num_to_idx = defaultdict(dict) 758 759 for token_idx, (chain_id, token_num, atom_name) in enumerate( 760 zip(token_chain_ids, token_res_ids, token_atom_names) 761 ): 762 763 token_num = self.renumber_chain_res_num( 764 chain_res_num=token_num, 765 chain_id=chain_id, 766 ) 767 768 idx_to_num[token_idx] = { 769 ResidueMapKeys.CHAIN_ID: chain_id, 770 ResidueMapKeys.TOKEN_NUM: token_num, 771 } 772 773 if depth == ResidueMapDepth.ATOM: 774 idx_to_num[token_idx][ResidueMapKeys.ATOM_NAME] = atom_name 775 776 if token_num not in num_to_idx[chain_id]: 777 if depth == ResidueMapDepth.ATOM: 778 num_to_idx[chain_id][token_num] = { 779 atom_name: token_idx 780 } 781 elif depth == ResidueMapDepth.RESIDUE: 782 num_to_idx[chain_id][token_num] = token_idx 783 else: 784 if depth == ResidueMapDepth.ATOM: 785 num_to_idx[chain_id][token_num][atom_name] = token_idx 786 elif depth == ResidueMapDepth.RESIDUE: 787 warnings.warn( 788 f""" 789 790 Multiple tokens found for residue number {token_num} 791 in chain {chain_id}. Overwriting the previous token index. 792 """ 793 ) 794 num_to_idx[chain_id][token_num] = token_idx 795 796 return idx_to_num, num_to_idx
Class to renumber the residues based on the offset.
If the input sequence to the AlphFold does not start from the first residue, the residue numbering in the predicted structure will be different from the original (UniProt in case of proteins) numbering.
This class renumbers the residues in the structure based on the provided offset.
Although written with the purpose for AlphFold predicted structures, this class can be used to renumber residues in any structure file.
Offset describing start and end residue number for each chain in the predicted structure.
example: {'A': [1, 100], 'B': [101, 200]}.
535 def renumber_structure( 536 self, 537 structure: Bio.PDB.Structure.Structure, 538 ): 539 """Renumber the residues in the structure based on the offset. 540 541 Arguments: 542 543 - **structure (Bio.PDB.Structure.Structure)**:<br /> 544 Biopython structure object. 545 546 Returns: 547 548 - **structure (Bio.PDB.Structure.Structure)**:<br /> 549 Biopython structure object with renumbered residues. 550 """ 551 552 for model in structure: 553 for residue in model.get_residues(): 554 chain_id = residue.parent.id 555 h, num, ins = residue.id 556 557 num = self.renumber_chain_res_num( 558 chain_res_num=num, 559 chain_id=chain_id, 560 ) 561 562 residue.id = (h, num, ins) 563 564 return structure
Renumber the residues in the structure based on the offset.
Arguments:
- structure (Bio.PDB.Structure.Structure):
Biopython structure object.
Returns:
- structure (Bio.PDB.Structure.Structure):
Biopython structure object with renumbered residues.
566 def original_chain_res_num( 567 self, 568 chain_res_num: int, 569 chain_id: str, 570 ): 571 """Get the original residue number based on the offset. 572 573 Inverse of `renumber_chain_res_num`. 574 575 Arguments: 576 577 - **chain_res_num (int)**:<br /> 578 Residue index (1-indexed) within the chain in the predicted structure. 579 580 - **chain_id (str)**:<br /> 581 Chain ID of the residue. 582 583 Returns: 584 585 - **chain_res_num (int)**:<br /> 586 Original residue number 587 """ 588 589 if chain_id in self.offset: 590 chain_res_num -= (self.offset[chain_id][0] - 1) 591 592 return chain_res_num
Get the original residue number based on the offset.
Inverse of renumber_chain_res_num.
Arguments:
chain_res_num (int):
Residue index (1-indexed) within the chain in the predicted structure.chain_id (str):
Chain ID of the residue.
Returns:
- chain_res_num (int):
Original residue number
594 def renumber_chain_res_num( 595 self, 596 chain_res_num: int, 597 chain_id: str, 598 ): 599 """Renumber the residue number based on the offset. 600 601 Given a residue index (1-indexed) in a chain, this function renumbers it 602 based on the offset provided for that chain. 603 604 Arguments: 605 606 - **chain_res_num (int)**:<br /> 607 Residue index (1-indexed) within the chain in the predicted structure. 608 609 - **chain_id (str)**:<br /> 610 Chain ID of the residue. 611 612 Returns: 613 614 - **chain_res_num (int)**:<br /> 615 Renumbered residue number 616 """ 617 618 if chain_id in self.offset: 619 chain_res_num += self.offset[chain_id][0] - 1 620 621 return chain_res_num
Renumber the residue number based on the offset.
Given a residue index (1-indexed) in a chain, this function renumbers it based on the offset provided for that chain.
Arguments:
chain_res_num (int):
Residue index (1-indexed) within the chain in the predicted structure.chain_id (str):
Chain ID of the residue.
Returns:
- chain_res_num (int):
Renumbered residue number
623 def renumber_region_of_interest( 624 self, 625 region_of_interest: Dict[str, list], 626 ): 627 """Offset the region of interest to the AF2/3 numbering. 628 629 Region of interest is defined by the user is as per the original numbering 630 (UniProt in case of proteins). \n 631 However, if the prediction is done on a fragment of the protein, the 632 residue numbering in the predicted structure will be different compared to 633 the one provided by the user. \n 634 This function renumbers the region of interest to the numbering of the 635 predicted structure based on the `offset`. \n 636 637 Arguments: 638 639 - **region_of_interest (dict)**: 640 Dictionary containing the region of interest for each chain. 641 642 Returns: 643 644 - **renumbered_region_of_interest (dict)**: 645 Dictionary containing the renumbered region of interest for 646 each chain. 647 """ 648 649 renumbered_region_of_interest = {} 650 651 for chain_id in region_of_interest: 652 653 start, end = region_of_interest[chain_id] 654 655 if chain_id in self.offset: 656 657 start = start - (self.offset[chain_id][0] - 1) 658 end = end - (self.offset[chain_id][0] - 1) 659 660 renumbered_region_of_interest[chain_id] = [start, end] 661 662 return renumbered_region_of_interest
Offset the region of interest to the AF2/3 numbering.
Region of interest is defined by the user is as per the original numbering (UniProt in case of proteins).
However, if the prediction is done on a fragment of the protein, the residue numbering in the predicted structure will be different compared to the one provided by the user.
This function renumbers the region of interest to the numbering of the
predicted structure based on the offset.
Arguments:
- region_of_interest (dict): Dictionary containing the region of interest for each chain.
Returns:
- renumbered_region_of_interest (dict): Dictionary containing the renumbered region of interest for each chain.
664 def residue_map( 665 self, 666 token_chain_ids: list, 667 token_res_ids: list, 668 token_atom_names: list, 669 depth: ResidueMapDepth = ResidueMapDepth.ATOM, 670 ): 671 """Create a map of residue indices to residue numbers and vice-versa. 672 673 `res_idx` is essentially the token index. \n 674 `res_num` is the residue number in the chain which will be same as UniProt 675 numbering if the `offset` is provided correctly or full-length sequences 676 were used in the prediction. \n 677 `offset` informs what is the starting residue number for each chain. 678 679 For example, if the input is as follows: \n 680 ```python 681 token_chain_ids = ['A', 'A', 'A', 'B', 'B'] 682 token_res_ids = [1, 1, 2, 1, 2] 683 token_atom_names = ['N', 'CA', 'N', 'N', 'CA'] 684 ``` 685 The output will be: \n 686 ```python 687 idx_to_num = { 688 0: {'chain_id': 'A', 'res_num': 1, 'atom_name': 'N'}, 689 1: {'chain_id': 'A', 'res_num': 1, 'atom_name': 'CA'}, 690 2: {'chain_id': 'A', 'res_num': 2, 'atom_name': 'N'}, 691 3: {'chain_id': 'B', 'res_num': 1, 'atom_name': 'N'}, 692 4: {'chain_id': 'B', 'res_num': 2, 'atom_name': 'CA'}, 693 } 694 num_to_idx = { 695 'A': { 696 1: {'N': 0, 'CA': 1}, 697 2: {'N': 2}, 698 }, 699 'B': { 700 1: {'N': 3}, 701 2: {'CA': 4}, 702 }, 703 } 704 ``` 705 706 If the offset is provided as: \n 707 ```python 708 offset = { 709 'A': [10, 11], 710 'B': [30, 31], 711 } 712 ``` 713 The output will be: \n 714 ```python 715 idx_to_num = { 716 0: {'chain_id': 'A', 'res_num': 10, 'atom_name': 'N'}, 717 1: {'chain_id': 'A', 'res_num': 10, 'atom_name': 'CA'}, 718 2: {'chain_id': 'A', 'res_num': 11, 'atom_name': 'N'}, 719 3: {'chain_id': 'B', 'res_num': 30, 'atom_name': 'N'}, 720 4: {'chain_id': 'B', 'res_num': 31, 'atom_name': 'CA'}, 721 } 722 num_to_idx = { 723 'A': { 724 10: {'N': 0, 'CA': 1}, 725 11: {'N': 2}, 726 }, 727 'B': { 728 30: {'N': 3}, 729 31: {'CA': 4}, 730 }, 731 } 732 ``` 733 734 Arguments: 735 736 - **token_chain_ids (list)**:<br /> 737 Token chain IDs. Same as provided in the AF3 JSON output. 738 739 - **token_res_ids (list)**:<br /> 740 Token residue IDs. Same as provided in the AF3 JSON output. 741 742 - **token_atom_names (list)**:<br /> 743 Token atom names. 744 745 Returns: 746 747 - **(tuple)**:<br /> 748 749 - `idx_to_num (Dict)`: 750 Dictionary mapping token indices to token numbers. 751 752 - `num_to_idx (Dict)`: 753 Dictionary mapping token numbers to token indices. 754 """ 755 756 idx_to_num = {} 757 num_to_idx = defaultdict(dict) 758 759 for token_idx, (chain_id, token_num, atom_name) in enumerate( 760 zip(token_chain_ids, token_res_ids, token_atom_names) 761 ): 762 763 token_num = self.renumber_chain_res_num( 764 chain_res_num=token_num, 765 chain_id=chain_id, 766 ) 767 768 idx_to_num[token_idx] = { 769 ResidueMapKeys.CHAIN_ID: chain_id, 770 ResidueMapKeys.TOKEN_NUM: token_num, 771 } 772 773 if depth == ResidueMapDepth.ATOM: 774 idx_to_num[token_idx][ResidueMapKeys.ATOM_NAME] = atom_name 775 776 if token_num not in num_to_idx[chain_id]: 777 if depth == ResidueMapDepth.ATOM: 778 num_to_idx[chain_id][token_num] = { 779 atom_name: token_idx 780 } 781 elif depth == ResidueMapDepth.RESIDUE: 782 num_to_idx[chain_id][token_num] = token_idx 783 else: 784 if depth == ResidueMapDepth.ATOM: 785 num_to_idx[chain_id][token_num][atom_name] = token_idx 786 elif depth == ResidueMapDepth.RESIDUE: 787 warnings.warn( 788 f""" 789 790 Multiple tokens found for residue number {token_num} 791 in chain {chain_id}. Overwriting the previous token index. 792 """ 793 ) 794 num_to_idx[chain_id][token_num] = token_idx 795 796 return idx_to_num, num_to_idx
Create a map of residue indices to residue numbers and vice-versa.
res_idx is essentially the token index.
res_num is the residue number in the chain which will be same as UniProt
numbering if the offset is provided correctly or full-length sequences
were used in the prediction.
offset informs what is the starting residue number for each chain.
For example, if the input is as follows:
token_chain_ids = ['A', 'A', 'A', 'B', 'B']
token_res_ids = [1, 1, 2, 1, 2]
token_atom_names = ['N', 'CA', 'N', 'N', 'CA']
The output will be:
idx_to_num = {
0: {'chain_id': 'A', 'res_num': 1, 'atom_name': 'N'},
1: {'chain_id': 'A', 'res_num': 1, 'atom_name': 'CA'},
2: {'chain_id': 'A', 'res_num': 2, 'atom_name': 'N'},
3: {'chain_id': 'B', 'res_num': 1, 'atom_name': 'N'},
4: {'chain_id': 'B', 'res_num': 2, 'atom_name': 'CA'},
}
num_to_idx = {
'A': {
1: {'N': 0, 'CA': 1},
2: {'N': 2},
},
'B': {
1: {'N': 3},
2: {'CA': 4},
},
}
If the offset is provided as:
offset = {
'A': [10, 11],
'B': [30, 31],
}
The output will be:
idx_to_num = {
0: {'chain_id': 'A', 'res_num': 10, 'atom_name': 'N'},
1: {'chain_id': 'A', 'res_num': 10, 'atom_name': 'CA'},
2: {'chain_id': 'A', 'res_num': 11, 'atom_name': 'N'},
3: {'chain_id': 'B', 'res_num': 30, 'atom_name': 'N'},
4: {'chain_id': 'B', 'res_num': 31, 'atom_name': 'CA'},
}
num_to_idx = {
'A': {
10: {'N': 0, 'CA': 1},
11: {'N': 2},
},
'B': {
30: {'N': 3},
31: {'CA': 4},
},
}
Arguments:
token_chain_ids (list):
Token chain IDs. Same as provided in the AF3 JSON output.token_res_ids (list):
Token residue IDs. Same as provided in the AF3 JSON output.token_atom_names (list):
Token atom names.
Returns:
(tuple):
idx_to_num (Dict): Dictionary mapping token indices to token numbers.num_to_idx (Dict): Dictionary mapping token numbers to token indices.
799class ResidueSelect(Select): 800 """Class to select residues in the structure based on the input dictionary. 801 802 This is achieved by overriding the `Bio.PDB.Select.accept_residue` method 803 from Biopython. 804 """ 805 806 confident_residues: Dict 807 """ Dictionary containing the chain ID as key and a list of residue 808 numbers as value.<br /> 809 e.g. `{"A": [1, 2, 3], "B": [4, 5, 6]}` 810 """ 811 812 def __init__( 813 self, 814 confident_residues: Dict 815 ): 816 self.confident_residues = confident_residues 817 818 @overload 819 def accept_residue( 820 self, 821 residue: Bio.PDB.Residue.Residue 822 ) -> bool: 823 ... 824 825 @overload 826 def accept_residue(self, residue): 827 """Overload this to reject residues for output.""" 828 return 1 829 830 def accept_residue( 831 self, 832 residue: Bio.PDB.Residue.Residue 833 ) -> bool: 834 """Accept the residue if it's in `confident_residues`. 835 836 Arguments: 837 838 - **residue (Bio.PDB.Residue.Residue)**:<br /> 839 Biopython residue object. 840 841 Returns: 842 843 - **(bool)**:<br /> 844 `True` if the residue is in the `confident_residues`. 845 """ 846 847 chain = residue.parent 848 if not isinstance(chain, Chain): 849 raise TypeError( 850 f"Expected a Bio.PDB.Chain.Chain object, got {type(chain)}" 851 ) 852 chain_id = chain.id 853 854 return residue.id[1] in self.confident_residues.get(chain_id, [])
Class to select residues in the structure based on the input dictionary.
This is achieved by overriding the Bio.PDB.Select.accept_residue method
from Biopython.
Dictionary containing the chain ID as key and a list of residue
numbers as value.
e.g. {"A": [1, 2, 3], "B": [4, 5, 6]}
830 def accept_residue( 831 self, 832 residue: Bio.PDB.Residue.Residue 833 ) -> bool: 834 """Accept the residue if it's in `confident_residues`. 835 836 Arguments: 837 838 - **residue (Bio.PDB.Residue.Residue)**:<br /> 839 Biopython residue object. 840 841 Returns: 842 843 - **(bool)**:<br /> 844 `True` if the residue is in the `confident_residues`. 845 """ 846 847 chain = residue.parent 848 if not isinstance(chain, Chain): 849 raise TypeError( 850 f"Expected a Bio.PDB.Chain.Chain object, got {type(chain)}" 851 ) 852 chain_id = chain.id 853 854 return residue.id[1] in self.confident_residues.get(chain_id, [])
Accept the residue if it's in confident_residues.
Arguments:
- residue (Bio.PDB.Residue.Residue):
Biopython residue object.
Returns:
- (bool):
Trueif the residue is in theconfident_residues.