matrix_patches
Class to get interacting patches from a binary matrix.
1""" 2[matrix_patches](https://github.com/isblab/af_pipeline/tree/main/af_pipeline/tools/matrix_patches.py) 3============================== 4 5Class to get interacting patches from a binary matrix. 6""" 7 8import numpy as np 9import pandas as pd 10from collections import defaultdict 11from af_pipeline.utils.misc_utils import ( 12 get_key_from_res_range, 13 get_res_range_from_key, 14) 15from af_pipeline.constants.af_constants import MiscStrEnum 16 17class MatrixPatches: 18 """Class to get interacting patches from a binary matrix""" 19 20 matrix: np.ndarray 21 """ Binary matrix where rows and columns represent different objects 22 (e.g., chains in a protein complex). """ 23 24 row_obj: str 25 """ Identifier for the rows in the matrix. """ 26 27 col_obj: str 28 """ Identifier for the columns in the matrix. """ 29 30 def __init__( 31 self, 32 matrix: np.ndarray, 33 row_obj: str = "row_obj", 34 col_obj: str = "col_obj", 35 ): 36 self.matrix = matrix 37 self.row_obj = row_obj 38 self.col_obj = col_obj 39 40 def get_patches_from_matrix(self): 41 """Get all interacting patches from a binary matrix 42 43 Arguments: 44 45 - **matrix (np.ndarray)**:<br /> 46 Binary matrix where rows and columns represent different objects 47 (e.g., chains in a protein complex). 48 49 - **row_obj (str)**:<br /> 50 Identifier for the rows in the matrix 51 52 - **col_obj (str)**:<br /> 53 Identifier for the columns in the matrix 54 55 Returns: 56 57 - **patches (dict)**:<br /> 58 Dictionary of interacting patches 59 60 Example: 61 62 >>> matrix = np.array([ 63 ... [0, 0, 0, 1], 64 ... [0, 1, 1, 1], 65 ... [0, 0, 1, 1], 66 ... [0, 1, 0, 0], 67 ... [0, 1, 0, 1] 68 ... ]) 69 >>> matrix_patches = MatrixPatches( 70 ... matrix, row_obj="A", col_obj="B" 71 ... 
) 72 >>> matrix_patches.get_patches_from_matrix() 73 A B 74 0 {0, 1, 2} {3} 75 1 {1} {1, 2, 3} 76 2 {1, 2} {2, 3} 77 3 {3, 4} {1} 78 4 {4} {3} 79 """ 80 81 assert np.isin(self.matrix, [0, 1]).all() and np.any(self.matrix), ( 82 f"Matrix must be binary and non-empty, got {np.unique(self.matrix)}" 83 ) 84 85 row_sets = self.get_one_sets_from_matrix(self.matrix, axis=0) 86 col_sets = self.get_one_sets_from_matrix(self.matrix, axis=1) 87 88 split_row_sets = self.extend_one_sets_by_subsets(row_sets) 89 split_col_sets = self.extend_one_sets_by_subsets(col_sets) 90 91 df_row = self.one_sets_to_df( 92 split_row_sets, [self.row_obj, self.col_obj] 93 ) 94 df_col = self.one_sets_to_df( 95 split_col_sets, [self.col_obj, self.row_obj] 96 ) 97 98 df_row = self.aggregate_df_rows(df_row, self.col_obj, self.row_obj) 99 df_col = self.aggregate_df_rows(df_col, self.row_obj, self.col_obj) 100 101 combined_df = self.combine_dfs( 102 df_row, df_col, self.row_obj, self.col_obj 103 ) 104 105 for col in [self.row_obj, self.col_obj]: 106 combined_df[col] = combined_df[col].apply( 107 get_res_range_from_key, return_type="set" 108 ) 109 110 combined_df = self.remove_subset_rows( 111 combined_df, self.row_obj, self.col_obj 112 ) 113 114 return combined_df 115 116 @staticmethod 117 def get_one_sets_from_matrix(matrix: np.ndarray, axis: int = 0): 118 """Get the indices of 1s in a binary matrix rowwise or columnwise. 119 120 Arguments: 121 122 - **matrix (np.ndarray)**:<br /> 123 Binary matrix where rows and columns represent different objects 124 (e.g., chains in a protein complex). 125 126 - **axis (int, optional)**:<br /> 127 0 for rowwise, 1 for columnwise. 128 129 Returns: 130 131 - **one_sets (dict)**:<br /> 132 `{k:v}` where `v` is a set of indices of 1s for key `k`. 133 134 Example: 135 136 >>> matrix = np.array([ 137 ... [1, 0, 1], 138 ... [0, 1, 0], 139 ... [1, 1, 0] 140 ... 
]) 141 >>> MatrixPatches.get_one_sets_from_matrix(matrix=matrix, axis=0) 142 {0: {np.int64(0), np.int64(2)}, 1: {np.int64(1)}, 2: {np.int64(0), np.int64(1)}} 143 >>> MatrixPatches.get_one_sets_from_matrix(matrix=matrix, axis=1) 144 {0: {np.int64(0), np.int64(2)}, 1: {np.int64(1), np.int64(2)}, 2: {np.int64(0)}} 145 """ 146 147 assert np.isin(matrix, [0, 1]).all() and np.any(matrix), ( 148 f"Matrix must be binary and non-empty, got {np.unique(matrix)}" 149 ) 150 151 one_sets = {} 152 153 if axis == 0: # row_sets 154 for i in range(matrix.shape[0]): 155 one_sets[i] = set(np.where(matrix[i] == 1)[0]) 156 157 elif axis == 1: # col_sets 158 for j in range(matrix.shape[1]): 159 one_sets[j] = set(np.where(matrix[:, j] == 1)[0]) 160 161 return one_sets 162 163 @staticmethod 164 def extend_one_sets_by_subsets(one_sets: dict) -> dict: 165 """Add the subsets of the sets in list_of_sets to the one_sets. 166 167 Arguments: 168 169 - **one_sets (dict)**:<br /> 170 `{k:v}` where `v` is a set of indices of 1s for key `k`. 171 172 Returns: 173 174 - **new_one_sets (dict)**:<br /> 175 `{k:v}` where `v` is a list of sets of indices of 1s for key `k` 176 each set is a subset of the original set and is present in 177 the values of `one_sets`. 178 179 Example: 180 181 >>> one_sets = { 182 ... 0: {0, 1, 2, 3, 5, 6}, 183 ... 1: {1}, 184 ... 2: {0, 1} 185 ... 
} 186 >>> MatrixPatches.extend_one_sets_by_subsets(one_sets) 187 {0: [{0, 1, 2, 3}, {5, 6}, {1}, {0, 1}], 1: [{1}], 2: [{1}, {0, 1}]} 188 """ 189 190 split_sets = MatrixPatches.split_one_sets(one_sets) 191 192 new_one_sets = defaultdict(list) 193 list_of_sets = [] # unique sets from split_sets 194 list_of_sets = [ 195 set(x) 196 for xs in split_sets.values() 197 for x in xs 198 if set(x) not in list_of_sets 199 ] 200 201 for set1 in list_of_sets: 202 for idx, one_set in one_sets.items(): 203 if set1.issubset(one_set): 204 ( 205 new_one_sets[idx].append(set1) 206 if set1 not in new_one_sets[idx] 207 else None 208 ) 209 210 return dict(new_one_sets) 211 212 @staticmethod 213 def split_one_sets(one_sets: dict) -> dict: 214 """Split the sets in `one_sets` into sub-sets such that 215 each subset only contains consecutive indices. 216 217 Arguments: 218 219 - **one_sets (dict)**:<br /> 220 `{k:v}` where `v` is a set of indices of 1s for key `k`. 221 222 Returns: 223 224 - **new_one_sets (dict)**:<br /> 225 dictionary of lists of lists where each list contains the 226 indices of 1s. 227 228 Example: 229 230 >>> one_sets = {0: {0, 1, 2, 3, 5, 6}, 1: {1}, 2: {0, 1}} 231 >>> MatrixPatches.split_one_sets(one_sets) 232 {0: [[0, 1, 2, 3], [5, 6]], 1: [[1]], 2: [[0, 1]]} 233 """ 234 235 new_one_sets = {} 236 237 for i, one_set in one_sets.items(): 238 239 if not isinstance(one_set, set): 240 raise TypeError("one_set must be a set") 241 242 sub_sets = MatrixPatches.split_one_set(one_set) 243 new_one_sets[i] = sub_sets 244 245 return new_one_sets 246 247 @staticmethod 248 def split_one_set(one_set: set | list) -> list: 249 """Split a set of indices into sub-sets such that 250 each subset only contains consecutive indices. 251 252 Arguments: 253 254 - **one_set (set | list)**:<br /> 255 Set of indices of 1s. 256 257 Returns: 258 259 - **sub_sets (list)**:<br /> 260 List of lists where each list contains the indices of 1s. 
261 262 Example: 263 264 >>> one_set = {0, 1, 2, 3, 5, 6} \n 265 >>> MatrixPatches.split_one_set(one_set) 266 [[0, 1, 2, 3], [5, 6]] 267 """ 268 269 assert isinstance( 270 one_set, set | list 271 ), "one_set must be a set or a list" 272 273 sub_sets = [] 274 275 if isinstance(one_set, list): 276 # need to remove duplicates if any 277 one_set = set(one_set) 278 279 one_set = sorted(list(one_set)) 280 281 for idx, one_pos in enumerate(one_set): 282 283 curr_pos = one_pos 284 prev_pos = one_set[idx - 1] if idx > 0 else None 285 286 if idx == 0: 287 # If it's the first position, create a new sub-set 288 sub_sets.append([curr_pos]) 289 290 elif curr_pos - prev_pos == 1: 291 # If the current position is consecutive to the previous one 292 # add it to the last sub-set 293 sub_sets[-1].append(one_pos) 294 295 else: 296 # If the current position is not consecutive to the previous one 297 # create a new sub-set 298 sub_sets.append([curr_pos]) 299 300 return sub_sets 301 302 @staticmethod 303 def one_sets_to_df( 304 one_sets: dict, 305 columns: list 306 ): 307 """Convert a dictionary to a pandas DataFrame. 308 309 Arguments: 310 311 - **one_sets (dict)**:<br /> 312 Dictionary to convert. 313 314 - **columns (list)**:<br /> 315 Column names. 316 317 Returns: 318 319 - df (pd.DataFrame)**:<br /> 320 DataFrame with the dictionary keys as first column and values 321 as second column in columns. 322 323 Example: 324 325 >>> one_sets = { 326 ... 1: [{1, 2}, {5}], 327 ... 2: [{4, 5}, {6}] 328 ... 
} 329 >>> columns = ["A", "B"] 330 >>> MatrixPatches.one_sets_to_df(one_sets, columns) 331 A B 332 0 1 {1, 2} 333 1 1 {5} 334 2 2 {4, 5} 335 3 2 {6} 336 """ 337 338 if all([isinstance(val, list) for val in one_sets.values()]): 339 340 df_rows = [] 341 342 for k, v in one_sets.items(): 343 for val in v: 344 df_rows.append([str(k), val]) 345 346 df = pd.DataFrame(df_rows, columns=columns) 347 348 else: 349 raise ValueError("All values in the dictionary must be lists.") 350 351 return df 352 353 @staticmethod 354 def aggregate_df_rows( 355 df: pd.DataFrame, 356 groupby_col: str, 357 agg_col: str 358 ): 359 """Group a DataFrame by a column and aggregate another column. 360 361 Arguments: 362 363 - **df (pd.DataFrame)**:<br /> 364 DataFrame with groupby_col and agg_col. 365 366 - **groupby_col (str)**:<br /> 367 Column to group by (each value is a set). 368 369 - **agg_col (str)**:<br /> 370 Column to aggregate (each value is a string). 371 372 Returns: 373 374 - **df_group (pd.DataFrame)**:<br /> 375 Grouped DataFrame with both columns as a set. 376 377 Example: 378 379 >>> df = pd.DataFrame({ 380 ... "A": ["1", "1", "1", "2", "3", "4"], 381 ... "B": [{1}, {1,2}, {5}, {4,5}, {1,2}, {1,2}] 382 ... 
}) 383 >>> MatrixPatches.aggregate_df_rows(df, "B", "A") 384 B A 385 0 {1} {1} 386 1 {1, 2} {1, 3, 4} 387 2 {4, 5} {2} 388 3 {5} {1} 389 """ 390 391 df_group = ( 392 df.groupby(df[groupby_col].map(tuple))[agg_col] 393 .apply(",".join) 394 .reset_index() 395 ) 396 df_group[agg_col] = df_group[agg_col].astype(object) 397 for idx, row in df_group.iterrows(): 398 one_set = row[agg_col].split(",") 399 one_set = [int(x) for x in one_set] 400 one_set = sorted(one_set) 401 df_group.at[idx, agg_col] = set(one_set) 402 403 df_group[groupby_col] = df_group[groupby_col].apply(set) 404 405 return df_group 406 407 @staticmethod 408 def combine_dfs( 409 df1: pd.DataFrame, 410 df2: pd.DataFrame, 411 colname_1: str, 412 colname_2: str 413 ): 414 """Combine two DataFrames with columns colname_1 and colname_2 415 into a new DataFrame with interacting residues ranges without duplicates. 416 417 Arguments: 418 419 - **df1 (pd.DataFrame)**:<br /> 420 DataFrame 1. 421 422 - **df2 (pd.DataFrame)**:<br /> 423 DataFrame 2. 424 425 - **colname_1 (str)**:<br /> 426 Column name 1. 427 428 - **colname_2 (str)**:<br /> 429 Column name 2. 430 431 Returns: 432 433 - **new_df (pd.DataFrame)**:<br /> 434 Combined DataFrame of interacting residues ranges without 435 duplicates. 436 437 Example: 438 439 >>> df1 = pd.DataFrame({ 440 ... "A":[{1, 3, 4}, {1}, {1, 2}, {0, 1, 2, 4}], 441 ... "B":[{1}, {1, 2, 3}, {2, 3}, {3}] 442 ... }) 443 >>> df2 = pd.DataFrame({ 444 ... "A": [{0, 1, 2}, {1}, {1, 2}, {3, 4}, {4}], 445 ... "B": [{3}, {1, 2, 3}, {2, 3}, {1}, {1, 3}] 446 ... 
}) 447 448 >>> MatrixPatches.combine_dfs(df1, df2, "A", "B") 449 A B 450 0 0-2 3 451 1 1 1-3 452 2 1-2 2-3 453 3 3-4 1 454 4 4 1 455 5 4 3 456 6 1 1 457 """ 458 459 combined_df = pd.concat([df2, df1], axis=0) 460 combined_df.reset_index(drop=True, inplace=True) 461 462 df_rows = [] 463 464 for _, row in combined_df.iterrows(): 465 if isinstance(row[colname_1], set) and isinstance( 466 row[colname_2], set 467 ): 468 ranges1 = get_key_from_res_range(row[colname_1], as_list=True) 469 ranges2 = get_key_from_res_range(row[colname_2], as_list=True) 470 assert isinstance(ranges1, list) and isinstance(ranges2, list) 471 for res_range1 in ranges1: 472 for res_range2 in ranges2: 473 df_rows.append([res_range1, res_range2]) 474 475 new_df = pd.DataFrame(df_rows, columns=[colname_1, colname_2]) 476 new_df.drop_duplicates(inplace=True, keep=MiscStrEnum.FIRST) 477 new_df.reset_index(drop=True, inplace=True) 478 479 return new_df 480 481 @staticmethod 482 def remove_subset_rows( 483 df: pd.DataFrame, 484 colname_1: str, 485 colname_2: str 486 ): 487 """Remove rows that are subsets of other rows. 488 (from chatgpt) 489 490 Arguments: 491 492 - **df (pd.DataFrame)**:<br /> 493 DataFrame with columns `colname_1` and `colname_2`. 494 495 - **colname_1 (str)**:<br /> 496 column name 1. 497 498 - **colname_2 (str)**:<br /> 499 column name 2. 500 501 Returns: 502 503 - **filtered_df (pd.DataFrame)**:<br /> 504 DataFrame with subset rows removed. 505 506 Example: 507 508 >>> df = pd.DataFrame({ 509 ... "A": [{0, 1, 2}, {1}, {1, 2}, {3, 4}, {4}, {4}, {1}], 510 ... "B": [{3}, {1, 2, 3}, {2, 3}, {1}, {1}, {3}, {1}] 511 ... 
}) 512 >>> MatrixPatches.remove_subset_rows(df, "A", "B") 513 A B 514 0 {0, 1, 2} {3} 515 1 {1} {1, 2, 3} 516 2 {1, 2} {2, 3} 517 3 {3, 4} {1} 518 4 {4} {3} 519 """ 520 521 rows_to_keep = [] 522 523 for i, row in df.iterrows(): 524 525 if not any( 526 MatrixPatches.is_subset( 527 row, df.iloc[j], colname_1, colname_2 528 ) 529 for j in range(len(df)) 530 if i != j 531 ): 532 rows_to_keep.append(i) 533 534 filtered_df = df.loc[rows_to_keep].reset_index(drop=True) 535 536 return filtered_df 537 538 @staticmethod 539 def is_subset( 540 row: pd.Series, 541 other_row: pd.Series, 542 colname_1: str, 543 colname_2: str, 544 ): 545 """Check if row is a subset of other_row for two specified columns. 546 547 Arguments: 548 549 - **row (pd.Series)**:<br /> 550 Row to check if it is a subset of other_row. 551 552 - **other_row (pd.Series)**:<br /> 553 Row to check against. 554 555 - **colname_1 (str)**:<br /> 556 Column name 1. 557 558 - **colname_2 (str)**:<br /> 559 Column name 2. 560 561 Returns: 562 563 - **(bool)**:<br /> 564 `True` if row is a subset of `other_row`, `False` otherwise. 565 566 Example: 567 568 >>> row = pd.Series({"A": {0, 1, 2}, "B": {3}}) 569 >>> other_row = pd.Series({"A": {0, 1, 2, 3}, "B": {3}}) 570 >>> MatrixPatches.is_subset(row, other_row, "A", "B") 571 True 572 >>> other_row = pd.Series({"A": {1, 2}, "B": {3}}) 573 >>> MatrixPatches.is_subset(row, other_row, "A", "B") 574 False 575 """ 576 577 return ( 578 row[colname_1].issubset(other_row[colname_1]) 579 and row[colname_2].issubset(other_row[colname_2]) 580 ) 581 582 583if __name__ == "__main__": 584 585 import doctest 586 doctest.testmod()
18class MatrixPatches: 19 """Class to get interacting patches from a binary matrix""" 20 21 matrix: np.ndarray 22 """ Binary matrix where rows and columns represent different objects 23 (e.g., chains in a protein complex). """ 24 25 row_obj: str 26 """ Identifier for the rows in the matrix. """ 27 28 col_obj: str 29 """ Identifier for the columns in the matrix. """ 30 31 def __init__( 32 self, 33 matrix: np.ndarray, 34 row_obj: str = "row_obj", 35 col_obj: str = "col_obj", 36 ): 37 self.matrix = matrix 38 self.row_obj = row_obj 39 self.col_obj = col_obj 40 41 def get_patches_from_matrix(self): 42 """Get all interacting patches from a binary matrix 43 44 Arguments: 45 46 - **matrix (np.ndarray)**:<br /> 47 Binary matrix where rows and columns represent different objects 48 (e.g., chains in a protein complex). 49 50 - **row_obj (str)**:<br /> 51 Identifier for the rows in the matrix 52 53 - **col_obj (str)**:<br /> 54 Identifier for the columns in the matrix 55 56 Returns: 57 58 - **patches (dict)**:<br /> 59 Dictionary of interacting patches 60 61 Example: 62 63 >>> matrix = np.array([ 64 ... [0, 0, 0, 1], 65 ... [0, 1, 1, 1], 66 ... [0, 0, 1, 1], 67 ... [0, 1, 0, 0], 68 ... [0, 1, 0, 1] 69 ... ]) 70 >>> matrix_patches = MatrixPatches( 71 ... matrix, row_obj="A", col_obj="B" 72 ... 
) 73 >>> matrix_patches.get_patches_from_matrix() 74 A B 75 0 {0, 1, 2} {3} 76 1 {1} {1, 2, 3} 77 2 {1, 2} {2, 3} 78 3 {3, 4} {1} 79 4 {4} {3} 80 """ 81 82 assert np.isin(self.matrix, [0, 1]).all() and np.any(self.matrix), ( 83 f"Matrix must be binary and non-empty, got {np.unique(self.matrix)}" 84 ) 85 86 row_sets = self.get_one_sets_from_matrix(self.matrix, axis=0) 87 col_sets = self.get_one_sets_from_matrix(self.matrix, axis=1) 88 89 split_row_sets = self.extend_one_sets_by_subsets(row_sets) 90 split_col_sets = self.extend_one_sets_by_subsets(col_sets) 91 92 df_row = self.one_sets_to_df( 93 split_row_sets, [self.row_obj, self.col_obj] 94 ) 95 df_col = self.one_sets_to_df( 96 split_col_sets, [self.col_obj, self.row_obj] 97 ) 98 99 df_row = self.aggregate_df_rows(df_row, self.col_obj, self.row_obj) 100 df_col = self.aggregate_df_rows(df_col, self.row_obj, self.col_obj) 101 102 combined_df = self.combine_dfs( 103 df_row, df_col, self.row_obj, self.col_obj 104 ) 105 106 for col in [self.row_obj, self.col_obj]: 107 combined_df[col] = combined_df[col].apply( 108 get_res_range_from_key, return_type="set" 109 ) 110 111 combined_df = self.remove_subset_rows( 112 combined_df, self.row_obj, self.col_obj 113 ) 114 115 return combined_df 116 117 @staticmethod 118 def get_one_sets_from_matrix(matrix: np.ndarray, axis: int = 0): 119 """Get the indices of 1s in a binary matrix rowwise or columnwise. 120 121 Arguments: 122 123 - **matrix (np.ndarray)**:<br /> 124 Binary matrix where rows and columns represent different objects 125 (e.g., chains in a protein complex). 126 127 - **axis (int, optional)**:<br /> 128 0 for rowwise, 1 for columnwise. 129 130 Returns: 131 132 - **one_sets (dict)**:<br /> 133 `{k:v}` where `v` is a set of indices of 1s for key `k`. 134 135 Example: 136 137 >>> matrix = np.array([ 138 ... [1, 0, 1], 139 ... [0, 1, 0], 140 ... [1, 1, 0] 141 ... 
]) 142 >>> MatrixPatches.get_one_sets_from_matrix(matrix=matrix, axis=0) 143 {0: {np.int64(0), np.int64(2)}, 1: {np.int64(1)}, 2: {np.int64(0), np.int64(1)}} 144 >>> MatrixPatches.get_one_sets_from_matrix(matrix=matrix, axis=1) 145 {0: {np.int64(0), np.int64(2)}, 1: {np.int64(1), np.int64(2)}, 2: {np.int64(0)}} 146 """ 147 148 assert np.isin(matrix, [0, 1]).all() and np.any(matrix), ( 149 f"Matrix must be binary and non-empty, got {np.unique(matrix)}" 150 ) 151 152 one_sets = {} 153 154 if axis == 0: # row_sets 155 for i in range(matrix.shape[0]): 156 one_sets[i] = set(np.where(matrix[i] == 1)[0]) 157 158 elif axis == 1: # col_sets 159 for j in range(matrix.shape[1]): 160 one_sets[j] = set(np.where(matrix[:, j] == 1)[0]) 161 162 return one_sets 163 164 @staticmethod 165 def extend_one_sets_by_subsets(one_sets: dict) -> dict: 166 """Add the subsets of the sets in list_of_sets to the one_sets. 167 168 Arguments: 169 170 - **one_sets (dict)**:<br /> 171 `{k:v}` where `v` is a set of indices of 1s for key `k`. 172 173 Returns: 174 175 - **new_one_sets (dict)**:<br /> 176 `{k:v}` where `v` is a list of sets of indices of 1s for key `k` 177 each set is a subset of the original set and is present in 178 the values of `one_sets`. 179 180 Example: 181 182 >>> one_sets = { 183 ... 0: {0, 1, 2, 3, 5, 6}, 184 ... 1: {1}, 185 ... 2: {0, 1} 186 ... 
} 187 >>> MatrixPatches.extend_one_sets_by_subsets(one_sets) 188 {0: [{0, 1, 2, 3}, {5, 6}, {1}, {0, 1}], 1: [{1}], 2: [{1}, {0, 1}]} 189 """ 190 191 split_sets = MatrixPatches.split_one_sets(one_sets) 192 193 new_one_sets = defaultdict(list) 194 list_of_sets = [] # unique sets from split_sets 195 list_of_sets = [ 196 set(x) 197 for xs in split_sets.values() 198 for x in xs 199 if set(x) not in list_of_sets 200 ] 201 202 for set1 in list_of_sets: 203 for idx, one_set in one_sets.items(): 204 if set1.issubset(one_set): 205 ( 206 new_one_sets[idx].append(set1) 207 if set1 not in new_one_sets[idx] 208 else None 209 ) 210 211 return dict(new_one_sets) 212 213 @staticmethod 214 def split_one_sets(one_sets: dict) -> dict: 215 """Split the sets in `one_sets` into sub-sets such that 216 each subset only contains consecutive indices. 217 218 Arguments: 219 220 - **one_sets (dict)**:<br /> 221 `{k:v}` where `v` is a set of indices of 1s for key `k`. 222 223 Returns: 224 225 - **new_one_sets (dict)**:<br /> 226 dictionary of lists of lists where each list contains the 227 indices of 1s. 228 229 Example: 230 231 >>> one_sets = {0: {0, 1, 2, 3, 5, 6}, 1: {1}, 2: {0, 1}} 232 >>> MatrixPatches.split_one_sets(one_sets) 233 {0: [[0, 1, 2, 3], [5, 6]], 1: [[1]], 2: [[0, 1]]} 234 """ 235 236 new_one_sets = {} 237 238 for i, one_set in one_sets.items(): 239 240 if not isinstance(one_set, set): 241 raise TypeError("one_set must be a set") 242 243 sub_sets = MatrixPatches.split_one_set(one_set) 244 new_one_sets[i] = sub_sets 245 246 return new_one_sets 247 248 @staticmethod 249 def split_one_set(one_set: set | list) -> list: 250 """Split a set of indices into sub-sets such that 251 each subset only contains consecutive indices. 252 253 Arguments: 254 255 - **one_set (set | list)**:<br /> 256 Set of indices of 1s. 257 258 Returns: 259 260 - **sub_sets (list)**:<br /> 261 List of lists where each list contains the indices of 1s. 
262 263 Example: 264 265 >>> one_set = {0, 1, 2, 3, 5, 6} \n 266 >>> MatrixPatches.split_one_set(one_set) 267 [[0, 1, 2, 3], [5, 6]] 268 """ 269 270 assert isinstance( 271 one_set, set | list 272 ), "one_set must be a set or a list" 273 274 sub_sets = [] 275 276 if isinstance(one_set, list): 277 # need to remove duplicates if any 278 one_set = set(one_set) 279 280 one_set = sorted(list(one_set)) 281 282 for idx, one_pos in enumerate(one_set): 283 284 curr_pos = one_pos 285 prev_pos = one_set[idx - 1] if idx > 0 else None 286 287 if idx == 0: 288 # If it's the first position, create a new sub-set 289 sub_sets.append([curr_pos]) 290 291 elif curr_pos - prev_pos == 1: 292 # If the current position is consecutive to the previous one 293 # add it to the last sub-set 294 sub_sets[-1].append(one_pos) 295 296 else: 297 # If the current position is not consecutive to the previous one 298 # create a new sub-set 299 sub_sets.append([curr_pos]) 300 301 return sub_sets 302 303 @staticmethod 304 def one_sets_to_df( 305 one_sets: dict, 306 columns: list 307 ): 308 """Convert a dictionary to a pandas DataFrame. 309 310 Arguments: 311 312 - **one_sets (dict)**:<br /> 313 Dictionary to convert. 314 315 - **columns (list)**:<br /> 316 Column names. 317 318 Returns: 319 320 - df (pd.DataFrame)**:<br /> 321 DataFrame with the dictionary keys as first column and values 322 as second column in columns. 323 324 Example: 325 326 >>> one_sets = { 327 ... 1: [{1, 2}, {5}], 328 ... 2: [{4, 5}, {6}] 329 ... 
} 330 >>> columns = ["A", "B"] 331 >>> MatrixPatches.one_sets_to_df(one_sets, columns) 332 A B 333 0 1 {1, 2} 334 1 1 {5} 335 2 2 {4, 5} 336 3 2 {6} 337 """ 338 339 if all([isinstance(val, list) for val in one_sets.values()]): 340 341 df_rows = [] 342 343 for k, v in one_sets.items(): 344 for val in v: 345 df_rows.append([str(k), val]) 346 347 df = pd.DataFrame(df_rows, columns=columns) 348 349 else: 350 raise ValueError("All values in the dictionary must be lists.") 351 352 return df 353 354 @staticmethod 355 def aggregate_df_rows( 356 df: pd.DataFrame, 357 groupby_col: str, 358 agg_col: str 359 ): 360 """Group a DataFrame by a column and aggregate another column. 361 362 Arguments: 363 364 - **df (pd.DataFrame)**:<br /> 365 DataFrame with groupby_col and agg_col. 366 367 - **groupby_col (str)**:<br /> 368 Column to group by (each value is a set). 369 370 - **agg_col (str)**:<br /> 371 Column to aggregate (each value is a string). 372 373 Returns: 374 375 - **df_group (pd.DataFrame)**:<br /> 376 Grouped DataFrame with both columns as a set. 377 378 Example: 379 380 >>> df = pd.DataFrame({ 381 ... "A": ["1", "1", "1", "2", "3", "4"], 382 ... "B": [{1}, {1,2}, {5}, {4,5}, {1,2}, {1,2}] 383 ... 
}) 384 >>> MatrixPatches.aggregate_df_rows(df, "B", "A") 385 B A 386 0 {1} {1} 387 1 {1, 2} {1, 3, 4} 388 2 {4, 5} {2} 389 3 {5} {1} 390 """ 391 392 df_group = ( 393 df.groupby(df[groupby_col].map(tuple))[agg_col] 394 .apply(",".join) 395 .reset_index() 396 ) 397 df_group[agg_col] = df_group[agg_col].astype(object) 398 for idx, row in df_group.iterrows(): 399 one_set = row[agg_col].split(",") 400 one_set = [int(x) for x in one_set] 401 one_set = sorted(one_set) 402 df_group.at[idx, agg_col] = set(one_set) 403 404 df_group[groupby_col] = df_group[groupby_col].apply(set) 405 406 return df_group 407 408 @staticmethod 409 def combine_dfs( 410 df1: pd.DataFrame, 411 df2: pd.DataFrame, 412 colname_1: str, 413 colname_2: str 414 ): 415 """Combine two DataFrames with columns colname_1 and colname_2 416 into a new DataFrame with interacting residues ranges without duplicates. 417 418 Arguments: 419 420 - **df1 (pd.DataFrame)**:<br /> 421 DataFrame 1. 422 423 - **df2 (pd.DataFrame)**:<br /> 424 DataFrame 2. 425 426 - **colname_1 (str)**:<br /> 427 Column name 1. 428 429 - **colname_2 (str)**:<br /> 430 Column name 2. 431 432 Returns: 433 434 - **new_df (pd.DataFrame)**:<br /> 435 Combined DataFrame of interacting residues ranges without 436 duplicates. 437 438 Example: 439 440 >>> df1 = pd.DataFrame({ 441 ... "A":[{1, 3, 4}, {1}, {1, 2}, {0, 1, 2, 4}], 442 ... "B":[{1}, {1, 2, 3}, {2, 3}, {3}] 443 ... }) 444 >>> df2 = pd.DataFrame({ 445 ... "A": [{0, 1, 2}, {1}, {1, 2}, {3, 4}, {4}], 446 ... "B": [{3}, {1, 2, 3}, {2, 3}, {1}, {1, 3}] 447 ... 
}) 448 449 >>> MatrixPatches.combine_dfs(df1, df2, "A", "B") 450 A B 451 0 0-2 3 452 1 1 1-3 453 2 1-2 2-3 454 3 3-4 1 455 4 4 1 456 5 4 3 457 6 1 1 458 """ 459 460 combined_df = pd.concat([df2, df1], axis=0) 461 combined_df.reset_index(drop=True, inplace=True) 462 463 df_rows = [] 464 465 for _, row in combined_df.iterrows(): 466 if isinstance(row[colname_1], set) and isinstance( 467 row[colname_2], set 468 ): 469 ranges1 = get_key_from_res_range(row[colname_1], as_list=True) 470 ranges2 = get_key_from_res_range(row[colname_2], as_list=True) 471 assert isinstance(ranges1, list) and isinstance(ranges2, list) 472 for res_range1 in ranges1: 473 for res_range2 in ranges2: 474 df_rows.append([res_range1, res_range2]) 475 476 new_df = pd.DataFrame(df_rows, columns=[colname_1, colname_2]) 477 new_df.drop_duplicates(inplace=True, keep=MiscStrEnum.FIRST) 478 new_df.reset_index(drop=True, inplace=True) 479 480 return new_df 481 482 @staticmethod 483 def remove_subset_rows( 484 df: pd.DataFrame, 485 colname_1: str, 486 colname_2: str 487 ): 488 """Remove rows that are subsets of other rows. 489 (from chatgpt) 490 491 Arguments: 492 493 - **df (pd.DataFrame)**:<br /> 494 DataFrame with columns `colname_1` and `colname_2`. 495 496 - **colname_1 (str)**:<br /> 497 column name 1. 498 499 - **colname_2 (str)**:<br /> 500 column name 2. 501 502 Returns: 503 504 - **filtered_df (pd.DataFrame)**:<br /> 505 DataFrame with subset rows removed. 506 507 Example: 508 509 >>> df = pd.DataFrame({ 510 ... "A": [{0, 1, 2}, {1}, {1, 2}, {3, 4}, {4}, {4}, {1}], 511 ... "B": [{3}, {1, 2, 3}, {2, 3}, {1}, {1}, {3}, {1}] 512 ... 
}) 513 >>> MatrixPatches.remove_subset_rows(df, "A", "B") 514 A B 515 0 {0, 1, 2} {3} 516 1 {1} {1, 2, 3} 517 2 {1, 2} {2, 3} 518 3 {3, 4} {1} 519 4 {4} {3} 520 """ 521 522 rows_to_keep = [] 523 524 for i, row in df.iterrows(): 525 526 if not any( 527 MatrixPatches.is_subset( 528 row, df.iloc[j], colname_1, colname_2 529 ) 530 for j in range(len(df)) 531 if i != j 532 ): 533 rows_to_keep.append(i) 534 535 filtered_df = df.loc[rows_to_keep].reset_index(drop=True) 536 537 return filtered_df 538 539 @staticmethod 540 def is_subset( 541 row: pd.Series, 542 other_row: pd.Series, 543 colname_1: str, 544 colname_2: str, 545 ): 546 """Check if row is a subset of other_row for two specified columns. 547 548 Arguments: 549 550 - **row (pd.Series)**:<br /> 551 Row to check if it is a subset of other_row. 552 553 - **other_row (pd.Series)**:<br /> 554 Row to check against. 555 556 - **colname_1 (str)**:<br /> 557 Column name 1. 558 559 - **colname_2 (str)**:<br /> 560 Column name 2. 561 562 Returns: 563 564 - **(bool)**:<br /> 565 `True` if row is a subset of `other_row`, `False` otherwise. 566 567 Example: 568 569 >>> row = pd.Series({"A": {0, 1, 2}, "B": {3}}) 570 >>> other_row = pd.Series({"A": {0, 1, 2, 3}, "B": {3}}) 571 >>> MatrixPatches.is_subset(row, other_row, "A", "B") 572 True 573 >>> other_row = pd.Series({"A": {1, 2}, "B": {3}}) 574 >>> MatrixPatches.is_subset(row, other_row, "A", "B") 575 False 576 """ 577 578 return ( 579 row[colname_1].issubset(other_row[colname_1]) 580 and row[colname_2].issubset(other_row[colname_2]) 581 )
Class to get interacting patches from a binary matrix
Binary matrix where rows and columns represent different objects (e.g., chains in a protein complex).
41 def get_patches_from_matrix(self): 42 """Get all interacting patches from a binary matrix 43 44 Arguments: 45 46 - **matrix (np.ndarray)**:<br /> 47 Binary matrix where rows and columns represent different objects 48 (e.g., chains in a protein complex). 49 50 - **row_obj (str)**:<br /> 51 Identifier for the rows in the matrix 52 53 - **col_obj (str)**:<br /> 54 Identifier for the columns in the matrix 55 56 Returns: 57 58 - **patches (dict)**:<br /> 59 Dictionary of interacting patches 60 61 Example: 62 63 >>> matrix = np.array([ 64 ... [0, 0, 0, 1], 65 ... [0, 1, 1, 1], 66 ... [0, 0, 1, 1], 67 ... [0, 1, 0, 0], 68 ... [0, 1, 0, 1] 69 ... ]) 70 >>> matrix_patches = MatrixPatches( 71 ... matrix, row_obj="A", col_obj="B" 72 ... ) 73 >>> matrix_patches.get_patches_from_matrix() 74 A B 75 0 {0, 1, 2} {3} 76 1 {1} {1, 2, 3} 77 2 {1, 2} {2, 3} 78 3 {3, 4} {1} 79 4 {4} {3} 80 """ 81 82 assert np.isin(self.matrix, [0, 1]).all() and np.any(self.matrix), ( 83 f"Matrix must be binary and non-empty, got {np.unique(self.matrix)}" 84 ) 85 86 row_sets = self.get_one_sets_from_matrix(self.matrix, axis=0) 87 col_sets = self.get_one_sets_from_matrix(self.matrix, axis=1) 88 89 split_row_sets = self.extend_one_sets_by_subsets(row_sets) 90 split_col_sets = self.extend_one_sets_by_subsets(col_sets) 91 92 df_row = self.one_sets_to_df( 93 split_row_sets, [self.row_obj, self.col_obj] 94 ) 95 df_col = self.one_sets_to_df( 96 split_col_sets, [self.col_obj, self.row_obj] 97 ) 98 99 df_row = self.aggregate_df_rows(df_row, self.col_obj, self.row_obj) 100 df_col = self.aggregate_df_rows(df_col, self.row_obj, self.col_obj) 101 102 combined_df = self.combine_dfs( 103 df_row, df_col, self.row_obj, self.col_obj 104 ) 105 106 for col in [self.row_obj, self.col_obj]: 107 combined_df[col] = combined_df[col].apply( 108 get_res_range_from_key, return_type="set" 109 ) 110 111 combined_df = self.remove_subset_rows( 112 combined_df, self.row_obj, self.col_obj 113 ) 114 115 return combined_df
Get all interacting patches from a binary matrix
Arguments:
matrix (np.ndarray):
Binary matrix where rows and columns represent different objects (e.g., chains in a protein complex).

row_obj (str):
Identifier for the rows in the matrix.

col_obj (str):
Identifier for the columns in the matrix
Returns:
- patches (pd.DataFrame):
DataFrame of interacting patches, with one column per object and a set of indices in each cell
Example:
>>> matrix = np.array([
... [0, 0, 0, 1],
... [0, 1, 1, 1],
... [0, 0, 1, 1],
... [0, 1, 0, 0],
... [0, 1, 0, 1]
... ])
>>> matrix_patches = MatrixPatches(
... matrix, row_obj="A", col_obj="B"
... )
>>> matrix_patches.get_patches_from_matrix()
A B
0 {0, 1, 2} {3}
1 {1} {1, 2, 3}
2 {1, 2} {2, 3}
3 {3, 4} {1}
4 {4} {3}
117 @staticmethod 118 def get_one_sets_from_matrix(matrix: np.ndarray, axis: int = 0): 119 """Get the indices of 1s in a binary matrix rowwise or columnwise. 120 121 Arguments: 122 123 - **matrix (np.ndarray)**:<br /> 124 Binary matrix where rows and columns represent different objects 125 (e.g., chains in a protein complex). 126 127 - **axis (int, optional)**:<br /> 128 0 for rowwise, 1 for columnwise. 129 130 Returns: 131 132 - **one_sets (dict)**:<br /> 133 `{k:v}` where `v` is a set of indices of 1s for key `k`. 134 135 Example: 136 137 >>> matrix = np.array([ 138 ... [1, 0, 1], 139 ... [0, 1, 0], 140 ... [1, 1, 0] 141 ... ]) 142 >>> MatrixPatches.get_one_sets_from_matrix(matrix=matrix, axis=0) 143 {0: {np.int64(0), np.int64(2)}, 1: {np.int64(1)}, 2: {np.int64(0), np.int64(1)}} 144 >>> MatrixPatches.get_one_sets_from_matrix(matrix=matrix, axis=1) 145 {0: {np.int64(0), np.int64(2)}, 1: {np.int64(1), np.int64(2)}, 2: {np.int64(0)}} 146 """ 147 148 assert np.isin(matrix, [0, 1]).all() and np.any(matrix), ( 149 f"Matrix must be binary and non-empty, got {np.unique(matrix)}" 150 ) 151 152 one_sets = {} 153 154 if axis == 0: # row_sets 155 for i in range(matrix.shape[0]): 156 one_sets[i] = set(np.where(matrix[i] == 1)[0]) 157 158 elif axis == 1: # col_sets 159 for j in range(matrix.shape[1]): 160 one_sets[j] = set(np.where(matrix[:, j] == 1)[0]) 161 162 return one_sets
Get the indices of 1s in a binary matrix rowwise or columnwise.
Arguments:
matrix (np.ndarray):
Binary matrix where rows and columns represent different objects (e.g., chains in a protein complex).

axis (int, optional):
0 for rowwise, 1 for columnwise.
Returns:
- one_sets (dict):
`{k:v}` where `v` is a set of indices of 1s for key `k`.
Example:
>>> matrix = np.array([
... [1, 0, 1],
... [0, 1, 0],
... [1, 1, 0]
... ])
>>> MatrixPatches.get_one_sets_from_matrix(matrix=matrix, axis=0)
{0: {np.int64(0), np.int64(2)}, 1: {np.int64(1)}, 2: {np.int64(0), np.int64(1)}}
>>> MatrixPatches.get_one_sets_from_matrix(matrix=matrix, axis=1)
{0: {np.int64(0), np.int64(2)}, 1: {np.int64(1), np.int64(2)}, 2: {np.int64(0)}}
@staticmethod
def extend_one_sets_by_subsets(one_sets: dict) -> dict:
    """Extend each entry of ``one_sets`` with the consecutive-index subsets
    (from any entry) that it contains.

    Arguments:

    - **one_sets (dict)**:<br />
        `{k:v}` where `v` is a set of indices of 1s for key `k`.

    Returns:

    - **new_one_sets (dict)**:<br />
        `{k:v}` where `v` is a list of sets of indices of 1s for key `k`;
        each set is a consecutive-index subset of the original set and is
        present in the values of `one_sets`.

    Example:

    >>> one_sets = {
    ...     0: {0, 1, 2, 3, 5, 6},
    ...     1: {1},
    ...     2: {0, 1}
    ... }
    >>> MatrixPatches.extend_one_sets_by_subsets(one_sets)
    {0: [{0, 1, 2, 3}, {5, 6}, {1}, {0, 1}], 1: [{1}], 2: [{1}, {0, 1}]}
    """

    split_sets = MatrixPatches.split_one_sets(one_sets)

    # Collect unique consecutive-index subsets in first-seen order.
    # NOTE: the previous comprehension tested membership against the list it
    # was about to rebind, so it always saw an empty list and never
    # de-duplicated; the explicit loop below actually does.
    unique_subsets = []
    for sub_sets in split_sets.values():
        for sub_set in sub_sets:
            candidate = set(sub_set)
            if candidate not in unique_subsets:
                unique_subsets.append(candidate)

    new_one_sets = defaultdict(list)

    # Attach each unique subset to every key whose set contains it.
    # Candidates are unique, so no per-key duplicate check is needed.
    for candidate in unique_subsets:
        for idx, one_set in one_sets.items():
            if candidate.issubset(one_set):
                new_one_sets[idx].append(candidate)

    return dict(new_one_sets)
Add the subsets of the sets in list_of_sets to the one_sets.
Arguments:
- one_sets (dict):
`{k:v}` where `v` is a set of indices of 1s for key `k`.
Returns:
- new_one_sets (dict):
`{k:v}` where `v` is a list of sets of indices of 1s for key `k`; each set is a subset of the original set and is present in the values of `one_sets`.
Example:
>>> one_sets = {
... 0: {0, 1, 2, 3, 5, 6},
... 1: {1},
... 2: {0, 1}
... }
>>> MatrixPatches.extend_one_sets_by_subsets(one_sets)
{0: [{0, 1, 2, 3}, {5, 6}, {1}, {0, 1}], 1: [{1}], 2: [{1}, {0, 1}]}
@staticmethod
def split_one_sets(one_sets: dict) -> dict:
    """Split every set in ``one_sets`` into runs of consecutive indices.

    Arguments:

    - **one_sets (dict)**:<br />
        `{k:v}` where `v` is a set of indices of 1s for key `k`.

    Returns:

    - **new_one_sets (dict)**:<br />
        dictionary of lists of lists where each inner list contains a
        run of consecutive indices of 1s.

    Raises:

    - **TypeError**: if any value of ``one_sets`` is not a set.

    Example:

    >>> one_sets = {0: {0, 1, 2, 3, 5, 6}, 1: {1}, 2: {0, 1}}
    >>> MatrixPatches.split_one_sets(one_sets)
    {0: [[0, 1, 2, 3], [5, 6]], 1: [[1]], 2: [[0, 1]]}
    """

    # Validate all values up front; splitting itself is side-effect free.
    for one_set in one_sets.values():
        if not isinstance(one_set, set):
            raise TypeError("one_set must be a set")

    return {
        key: MatrixPatches.split_one_set(one_set)
        for key, one_set in one_sets.items()
    }
Split the sets in one_sets into sub-sets such that
each subset only contains consecutive indices.
Arguments:
- one_sets (dict):
`{k:v}` where `v` is a set of indices of 1s for key `k`.
Returns:
- new_one_sets (dict):
dictionary of lists of lists where each list contains the indices of 1s.
Example:
>>> one_sets = {0: {0, 1, 2, 3, 5, 6}, 1: {1}, 2: {0, 1}}
>>> MatrixPatches.split_one_sets(one_sets)
{0: [[0, 1, 2, 3], [5, 6]], 1: [[1]], 2: [[0, 1]]}
248 @staticmethod 249 def split_one_set(one_set: set | list) -> list: 250 """Split a set of indices into sub-sets such that 251 each subset only contains consecutive indices. 252 253 Arguments: 254 255 - **one_set (set | list)**:<br /> 256 Set of indices of 1s. 257 258 Returns: 259 260 - **sub_sets (list)**:<br /> 261 List of lists where each list contains the indices of 1s. 262 263 Example: 264 265 >>> one_set = {0, 1, 2, 3, 5, 6} \n 266 >>> MatrixPatches.split_one_set(one_set) 267 [[0, 1, 2, 3], [5, 6]] 268 """ 269 270 assert isinstance( 271 one_set, set | list 272 ), "one_set must be a set or a list" 273 274 sub_sets = [] 275 276 if isinstance(one_set, list): 277 # need to remove duplicates if any 278 one_set = set(one_set) 279 280 one_set = sorted(list(one_set)) 281 282 for idx, one_pos in enumerate(one_set): 283 284 curr_pos = one_pos 285 prev_pos = one_set[idx - 1] if idx > 0 else None 286 287 if idx == 0: 288 # If it's the first position, create a new sub-set 289 sub_sets.append([curr_pos]) 290 291 elif curr_pos - prev_pos == 1: 292 # If the current position is consecutive to the previous one 293 # add it to the last sub-set 294 sub_sets[-1].append(one_pos) 295 296 else: 297 # If the current position is not consecutive to the previous one 298 # create a new sub-set 299 sub_sets.append([curr_pos]) 300 301 return sub_sets
Split a set of indices into sub-sets such that each subset only contains consecutive indices.
Arguments:
- one_set (set | list):
Set of indices of 1s.
Returns:
- sub_sets (list):
List of lists where each list contains the indices of 1s.
Example:
>>> one_set = {0, 1, 2, 3, 5, 6}
>>> MatrixPatches.split_one_set(one_set)
[[0, 1, 2, 3], [5, 6]]
303 @staticmethod 304 def one_sets_to_df( 305 one_sets: dict, 306 columns: list 307 ): 308 """Convert a dictionary to a pandas DataFrame. 309 310 Arguments: 311 312 - **one_sets (dict)**:<br /> 313 Dictionary to convert. 314 315 - **columns (list)**:<br /> 316 Column names. 317 318 Returns: 319 320 - df (pd.DataFrame)**:<br /> 321 DataFrame with the dictionary keys as first column and values 322 as second column in columns. 323 324 Example: 325 326 >>> one_sets = { 327 ... 1: [{1, 2}, {5}], 328 ... 2: [{4, 5}, {6}] 329 ... } 330 >>> columns = ["A", "B"] 331 >>> MatrixPatches.one_sets_to_df(one_sets, columns) 332 A B 333 0 1 {1, 2} 334 1 1 {5} 335 2 2 {4, 5} 336 3 2 {6} 337 """ 338 339 if all([isinstance(val, list) for val in one_sets.values()]): 340 341 df_rows = [] 342 343 for k, v in one_sets.items(): 344 for val in v: 345 df_rows.append([str(k), val]) 346 347 df = pd.DataFrame(df_rows, columns=columns) 348 349 else: 350 raise ValueError("All values in the dictionary must be lists.") 351 352 return df
Convert a dictionary to a pandas DataFrame.
Arguments:
one_sets (dict):
Dictionary to convert.

columns (list):
Column names.
Returns:
- df (pd.DataFrame):
DataFrame with the dictionary keys as first column and values as second column in columns.
Example:
>>> one_sets = {
... 1: [{1, 2}, {5}],
... 2: [{4, 5}, {6}]
... }
>>> columns = ["A", "B"]
>>> MatrixPatches.one_sets_to_df(one_sets, columns)
A B
0 1 {1, 2}
1 1 {5}
2 2 {4, 5}
3 2 {6}
354 @staticmethod 355 def aggregate_df_rows( 356 df: pd.DataFrame, 357 groupby_col: str, 358 agg_col: str 359 ): 360 """Group a DataFrame by a column and aggregate another column. 361 362 Arguments: 363 364 - **df (pd.DataFrame)**:<br /> 365 DataFrame with groupby_col and agg_col. 366 367 - **groupby_col (str)**:<br /> 368 Column to group by (each value is a set). 369 370 - **agg_col (str)**:<br /> 371 Column to aggregate (each value is a string). 372 373 Returns: 374 375 - **df_group (pd.DataFrame)**:<br /> 376 Grouped DataFrame with both columns as a set. 377 378 Example: 379 380 >>> df = pd.DataFrame({ 381 ... "A": ["1", "1", "1", "2", "3", "4"], 382 ... "B": [{1}, {1,2}, {5}, {4,5}, {1,2}, {1,2}] 383 ... }) 384 >>> MatrixPatches.aggregate_df_rows(df, "B", "A") 385 B A 386 0 {1} {1} 387 1 {1, 2} {1, 3, 4} 388 2 {4, 5} {2} 389 3 {5} {1} 390 """ 391 392 df_group = ( 393 df.groupby(df[groupby_col].map(tuple))[agg_col] 394 .apply(",".join) 395 .reset_index() 396 ) 397 df_group[agg_col] = df_group[agg_col].astype(object) 398 for idx, row in df_group.iterrows(): 399 one_set = row[agg_col].split(",") 400 one_set = [int(x) for x in one_set] 401 one_set = sorted(one_set) 402 df_group.at[idx, agg_col] = set(one_set) 403 404 df_group[groupby_col] = df_group[groupby_col].apply(set) 405 406 return df_group
Group a DataFrame by a column and aggregate another column.
Arguments:
df (pd.DataFrame):
DataFrame with groupby_col and agg_col.

groupby_col (str):
Column to group by (each value is a set).

agg_col (str):
Column to aggregate (each value is a string).
Returns:
- df_group (pd.DataFrame):
Grouped DataFrame with both columns as a set.
Example:
>>> df = pd.DataFrame({
... "A": ["1", "1", "1", "2", "3", "4"],
... "B": [{1}, {1,2}, {5}, {4,5}, {1,2}, {1,2}]
... })
>>> MatrixPatches.aggregate_df_rows(df, "B", "A")
B A
0 {1} {1}
1 {1, 2} {1, 3, 4}
2 {4, 5} {2}
3 {5} {1}
@staticmethod
def combine_dfs(
    df1: pd.DataFrame,
    df2: pd.DataFrame,
    colname_1: str,
    colname_2: str
):
    """Combine two DataFrames with columns colname_1 and colname_2 into a
    new DataFrame of interacting residue ranges without duplicates.

    Arguments:

    - **df1 (pd.DataFrame)**:<br />
        DataFrame 1.

    - **df2 (pd.DataFrame)**:<br />
        DataFrame 2.

    - **colname_1 (str)**:<br />
        Column name 1.

    - **colname_2 (str)**:<br />
        Column name 2.

    Returns:

    - **new_df (pd.DataFrame)**:<br />
        Combined DataFrame of interacting residue ranges without
        duplicates; rows from ``df2`` come first, matching the original
        concatenation order.

    Example:

    >>> df1 = pd.DataFrame({
    ...     "A": [{1, 3, 4}, {1}, {1, 2}, {0, 1, 2, 4}],
    ...     "B": [{1}, {1, 2, 3}, {2, 3}, {3}]
    ... })
    >>> df2 = pd.DataFrame({
    ...     "A": [{0, 1, 2}, {1}, {1, 2}, {3, 4}, {4}],
    ...     "B": [{3}, {1, 2, 3}, {2, 3}, {1}, {1, 3}]
    ... })
    >>> MatrixPatches.combine_dfs(df1, df2, "A", "B")
         A    B
    0  0-2    3
    1    1  1-3
    2  1-2  2-3
    3  3-4    1
    4    4    1
    5    4    3
    6    1    1
    """

    # df2 first, then df1, with a fresh 0..n-1 index.
    stacked = pd.concat([df2, df1], axis=0, ignore_index=True)

    expanded_rows = []

    for _, entry in stacked.iterrows():
        left, right = entry[colname_1], entry[colname_2]

        # Only rows where both cells are sets contribute ranges.
        if not (isinstance(left, set) and isinstance(right, set)):
            continue

        ranges1 = get_key_from_res_range(left, as_list=True)
        ranges2 = get_key_from_res_range(right, as_list=True)
        assert isinstance(ranges1, list) and isinstance(ranges2, list)

        # Cartesian product of the range strings from both columns.
        expanded_rows.extend(
            [range1, range2] for range1 in ranges1 for range2 in ranges2
        )

    new_df = pd.DataFrame(expanded_rows, columns=[colname_1, colname_2])
    new_df = new_df.drop_duplicates(keep=MiscStrEnum.FIRST).reset_index(
        drop=True
    )

    return new_df
Combine two DataFrames with columns colname_1 and colname_2 into a new DataFrame with interacting residues ranges without duplicates.
Arguments:
df1 (pd.DataFrame):
DataFrame 1.

df2 (pd.DataFrame):
DataFrame 2.

colname_1 (str):
Column name 1.

colname_2 (str):
Column name 2.
Returns:
- new_df (pd.DataFrame):
Combined DataFrame of interacting residues ranges without duplicates.
Example:
>>> df1 = pd.DataFrame({
... "A":[{1, 3, 4}, {1}, {1, 2}, {0, 1, 2, 4}],
... "B":[{1}, {1, 2, 3}, {2, 3}, {3}]
... })
>>> df2 = pd.DataFrame({
... "A": [{0, 1, 2}, {1}, {1, 2}, {3, 4}, {4}],
... "B": [{3}, {1, 2, 3}, {2, 3}, {1}, {1, 3}]
... })
>>> MatrixPatches.combine_dfs(df1, df2, "A", "B")
A B
0 0-2 3
1 1 1-3
2 1-2 2-3
3 3-4 1
4 4 1
5 4 3
6 1 1
@staticmethod
def remove_subset_rows(
    df: pd.DataFrame,
    colname_1: str,
    colname_2: str
):
    """Remove rows whose sets are contained in another row's sets
    (in both columns).

    NOTE: exact duplicate rows are mutual subsets; previously *both*
    copies were removed, silently losing the patch entirely. Now the
    first occurrence of a duplicate is kept.

    NOTE(review): assumes a default RangeIndex — ``iterrows`` labels are
    compared against ``iloc`` positions; verify callers pass a
    reset-index DataFrame.

    Arguments:

    - **df (pd.DataFrame)**:<br />
        DataFrame with columns `colname_1` and `colname_2`; each cell
        holds a set.

    - **colname_1 (str)**:<br />
        column name 1.

    - **colname_2 (str)**:<br />
        column name 2.

    Returns:

    - **filtered_df (pd.DataFrame)**:<br />
        DataFrame with subset rows removed and the index reset.

    Example:

    >>> df = pd.DataFrame({
    ...     "A": [{0, 1, 2}, {1}, {1, 2}, {3, 4}, {4}, {4}, {1}],
    ...     "B": [{3}, {1, 2, 3}, {2, 3}, {1}, {1}, {3}, {1}]
    ... })
    >>> MatrixPatches.remove_subset_rows(df, "A", "B")
               A          B
    0  {0, 1, 2}        {3}
    1        {1}  {1, 2, 3}
    2     {1, 2}     {2, 3}
    3     {3, 4}        {1}
    4        {4}        {3}
    """

    rows_to_keep = []

    for i, row in df.iterrows():

        dominated = False

        for j in range(len(df)):
            if i == j:
                continue

            other = df.iloc[j]

            if MatrixPatches.is_subset(row, other, colname_1, colname_2):
                if MatrixPatches.is_subset(other, row, colname_1, colname_2):
                    # Rows are equal: drop only the later occurrence so
                    # exactly one copy survives.
                    if j < i:
                        dominated = True
                        break
                else:
                    # Strict subset of another row: drop it.
                    dominated = True
                    break

        if not dominated:
            rows_to_keep.append(i)

    filtered_df = df.loc[rows_to_keep].reset_index(drop=True)

    return filtered_df
Remove rows that are subsets of other rows. (from chatgpt)
Arguments:
df (pd.DataFrame):
DataFrame with columns `colname_1` and `colname_2`.

colname_1 (str):
column name 1.

colname_2 (str):
column name 2.
Returns:
- filtered_df (pd.DataFrame):
DataFrame with subset rows removed.
Example:
>>> df = pd.DataFrame({
... "A": [{0, 1, 2}, {1}, {1, 2}, {3, 4}, {4}, {4}, {1}],
... "B": [{3}, {1, 2, 3}, {2, 3}, {1}, {1}, {3}, {1}]
... })
>>> MatrixPatches.remove_subset_rows(df, "A", "B")
A B
0 {0, 1, 2} {3}
1 {1} {1, 2, 3}
2 {1, 2} {2, 3}
3 {3, 4} {1}
4 {4} {3}
539 @staticmethod 540 def is_subset( 541 row: pd.Series, 542 other_row: pd.Series, 543 colname_1: str, 544 colname_2: str, 545 ): 546 """Check if row is a subset of other_row for two specified columns. 547 548 Arguments: 549 550 - **row (pd.Series)**:<br /> 551 Row to check if it is a subset of other_row. 552 553 - **other_row (pd.Series)**:<br /> 554 Row to check against. 555 556 - **colname_1 (str)**:<br /> 557 Column name 1. 558 559 - **colname_2 (str)**:<br /> 560 Column name 2. 561 562 Returns: 563 564 - **(bool)**:<br /> 565 `True` if row is a subset of `other_row`, `False` otherwise. 566 567 Example: 568 569 >>> row = pd.Series({"A": {0, 1, 2}, "B": {3}}) 570 >>> other_row = pd.Series({"A": {0, 1, 2, 3}, "B": {3}}) 571 >>> MatrixPatches.is_subset(row, other_row, "A", "B") 572 True 573 >>> other_row = pd.Series({"A": {1, 2}, "B": {3}}) 574 >>> MatrixPatches.is_subset(row, other_row, "A", "B") 575 False 576 """ 577 578 return ( 579 row[colname_1].issubset(other_row[colname_1]) 580 and row[colname_2].issubset(other_row[colname_2]) 581 )
Check if row is a subset of other_row for two specified columns.
Arguments:
row (pd.Series):
Row to check if it is a subset of other_row.

other_row (pd.Series):
Row to check against.

colname_1 (str):
Column name 1.

colname_2 (str):
Column name 2.
Returns:
- (bool):
`True` if row is a subset of `other_row`, `False` otherwise.
Example:
>>> row = pd.Series({"A": {0, 1, 2}, "B": {3}})
>>> other_row = pd.Series({"A": {0, 1, 2, 3}, "B": {3}})
>>> MatrixPatches.is_subset(row, other_row, "A", "B")
True
>>> other_row = pd.Series({"A": {1, 2}, "B": {3}})
>>> MatrixPatches.is_subset(row, other_row, "A", "B")
False