Edit on GitHub

matrix_patches

Class to get interacting patches from a binary matrix.

  1"""
  2[matrix_patches](https://github.com/isblab/af_pipeline/tree/main/af_pipeline/tools/matrix_patches.py)
  3==============================
  4
  5Class to get interacting patches from a binary matrix.
  6"""
  7
  8import numpy as np
  9import pandas as pd
 10from collections import defaultdict
 11from af_pipeline.utils.misc_utils import (
 12    get_key_from_res_range,
 13    get_res_range_from_key,
 14)
 15from af_pipeline.constants.af_constants import MiscStrEnum
 16
 17class MatrixPatches:
 18    """Class to get interacting patches from a binary matrix"""
 19
 20    matrix: np.ndarray
 21    """ Binary matrix where rows and columns represent different objects
 22    (e.g., chains in a protein complex). """
 23
 24    row_obj: str
 25    """ Identifier for the rows in the matrix. """
 26
 27    col_obj: str
 28    """ Identifier for the columns in the matrix. """
 29
 30    def __init__(
 31        self,
 32        matrix: np.ndarray,
 33        row_obj: str = "row_obj",
 34        col_obj: str = "col_obj",
 35    ):
 36        self.matrix = matrix
 37        self.row_obj = row_obj
 38        self.col_obj = col_obj
 39
 40    def get_patches_from_matrix(self):
 41        """Get all interacting patches from a binary matrix
 42
 43        Arguments:
 44
 45        - **matrix (np.ndarray)**:<br />
 46            Binary matrix where rows and columns represent different objects
 47            (e.g., chains in a protein complex).
 48
 49        - **row_obj (str)**:<br />
 50            Identifier for the rows in the matrix
 51
 52        - **col_obj (str)**:<br />
 53            Identifier for the columns in the matrix
 54
 55        Returns:
 56
 57        - **patches (dict)**:<br />
 58            Dictionary of interacting patches
 59
 60        Example:
 61
 62            >>> matrix = np.array([
 63            ... [0, 0, 0, 1],
 64            ... [0, 1, 1, 1],
 65            ... [0, 0, 1, 1],
 66            ... [0, 1, 0, 0],
 67            ... [0, 1, 0, 1]
 68            ... ])
 69            >>> matrix_patches = MatrixPatches(
 70            ... matrix, row_obj="A", col_obj="B"
 71            ... )
 72            >>> matrix_patches.get_patches_from_matrix()
 73                       A          B
 74            0  {0, 1, 2}        {3}
 75            1        {1}  {1, 2, 3}
 76            2     {1, 2}     {2, 3}
 77            3     {3, 4}        {1}
 78            4        {4}        {3}
 79        """
 80
 81        assert np.isin(self.matrix, [0, 1]).all() and np.any(self.matrix), (
 82            f"Matrix must be binary and non-empty, got {np.unique(self.matrix)}"
 83        )
 84
 85        row_sets = self.get_one_sets_from_matrix(self.matrix, axis=0)
 86        col_sets = self.get_one_sets_from_matrix(self.matrix, axis=1)
 87
 88        split_row_sets = self.extend_one_sets_by_subsets(row_sets)
 89        split_col_sets = self.extend_one_sets_by_subsets(col_sets)
 90
 91        df_row = self.one_sets_to_df(
 92            split_row_sets, [self.row_obj, self.col_obj]
 93        )
 94        df_col = self.one_sets_to_df(
 95            split_col_sets, [self.col_obj, self.row_obj]
 96        )
 97
 98        df_row = self.aggregate_df_rows(df_row, self.col_obj, self.row_obj)
 99        df_col = self.aggregate_df_rows(df_col, self.row_obj, self.col_obj)
100
101        combined_df = self.combine_dfs(
102            df_row, df_col, self.row_obj, self.col_obj
103        )
104
105        for col in [self.row_obj, self.col_obj]:
106            combined_df[col] = combined_df[col].apply(
107                get_res_range_from_key, return_type="set"
108            )
109
110        combined_df = self.remove_subset_rows(
111            combined_df, self.row_obj, self.col_obj
112        )
113
114        return combined_df
115
116    @staticmethod
117    def get_one_sets_from_matrix(matrix: np.ndarray, axis: int = 0):
118        """Get the indices of 1s in a binary matrix rowwise or columnwise.
119
120        Arguments:
121
122        - **matrix (np.ndarray)**:<br />
123            Binary matrix where rows and columns represent different objects
124            (e.g., chains in a protein complex).
125
126        - **axis (int, optional)**:<br />
127            0 for rowwise, 1 for columnwise.
128
129        Returns:
130
131        - **one_sets (dict)**:<br />
132            `{k:v}` where `v` is a set of indices of 1s for key `k`.
133
134        Example:
135
136            >>> matrix = np.array([
137            ... [1, 0, 1],
138            ... [0, 1, 0],
139            ... [1, 1, 0]
140            ... ])
141            >>> MatrixPatches.get_one_sets_from_matrix(matrix=matrix, axis=0)
142            {0: {np.int64(0), np.int64(2)}, 1: {np.int64(1)}, 2: {np.int64(0), np.int64(1)}}
143            >>> MatrixPatches.get_one_sets_from_matrix(matrix=matrix, axis=1)
144            {0: {np.int64(0), np.int64(2)}, 1: {np.int64(1), np.int64(2)}, 2: {np.int64(0)}}
145        """
146
147        assert np.isin(matrix, [0, 1]).all() and np.any(matrix), (
148            f"Matrix must be binary and non-empty, got {np.unique(matrix)}"
149        )
150
151        one_sets = {}
152
153        if axis == 0:  # row_sets
154            for i in range(matrix.shape[0]):
155                one_sets[i] = set(np.where(matrix[i] == 1)[0])
156
157        elif axis == 1:  # col_sets
158            for j in range(matrix.shape[1]):
159                one_sets[j] = set(np.where(matrix[:, j] == 1)[0])
160
161        return one_sets
162
163    @staticmethod
164    def extend_one_sets_by_subsets(one_sets: dict) -> dict:
165        """Add the subsets of the sets in list_of_sets to the one_sets.
166
167        Arguments:
168
169        - **one_sets (dict)**:<br />
170            `{k:v}` where `v` is a set of indices of 1s for key `k`.
171
172        Returns:
173
174        - **new_one_sets (dict)**:<br />
175            `{k:v}` where `v` is a list of sets of indices of 1s for key `k`
176            each set is a subset of the original set and is present in
177            the values of `one_sets`.
178
179        Example:
180
181            >>> one_sets = {
182            ... 0: {0, 1, 2, 3, 5, 6},
183            ... 1: {1},
184            ... 2: {0, 1}
185            ... }
186            >>> MatrixPatches.extend_one_sets_by_subsets(one_sets)
187            {0: [{0, 1, 2, 3}, {5, 6}, {1}, {0, 1}], 1: [{1}], 2: [{1}, {0, 1}]}
188        """
189
190        split_sets = MatrixPatches.split_one_sets(one_sets)
191
192        new_one_sets = defaultdict(list)
193        list_of_sets = []  # unique sets from split_sets
194        list_of_sets = [
195            set(x)
196            for xs in split_sets.values()
197            for x in xs
198            if set(x) not in list_of_sets
199        ]
200
201        for set1 in list_of_sets:
202            for idx, one_set in one_sets.items():
203                if set1.issubset(one_set):
204                    (
205                        new_one_sets[idx].append(set1)
206                        if set1 not in new_one_sets[idx]
207                        else None
208                    )
209
210        return dict(new_one_sets)
211
212    @staticmethod
213    def split_one_sets(one_sets: dict) -> dict:
214        """Split the sets in `one_sets` into sub-sets such that
215        each subset only contains consecutive indices.
216
217        Arguments:
218
219        - **one_sets (dict)**:<br />
220            `{k:v}` where `v` is a set of indices of 1s for key `k`.
221
222        Returns:
223
224        - **new_one_sets (dict)**:<br />
225            dictionary of lists of lists where each list contains the
226            indices of 1s.
227
228        Example:
229
230            >>> one_sets = {0: {0, 1, 2, 3, 5, 6}, 1: {1}, 2: {0, 1}}
231            >>> MatrixPatches.split_one_sets(one_sets)
232            {0: [[0, 1, 2, 3], [5, 6]], 1: [[1]], 2: [[0, 1]]}
233        """
234
235        new_one_sets = {}
236
237        for i, one_set in one_sets.items():
238
239            if not isinstance(one_set, set):
240                raise TypeError("one_set must be a set")
241
242            sub_sets = MatrixPatches.split_one_set(one_set)
243            new_one_sets[i] = sub_sets
244
245        return new_one_sets
246
247    @staticmethod
248    def split_one_set(one_set: set | list) -> list:
249        """Split a set of indices into sub-sets such that
250        each subset only contains consecutive indices.
251
252        Arguments:
253
254        - **one_set (set | list)**:<br />
255            Set of indices of 1s.
256
257        Returns:
258
259        - **sub_sets (list)**:<br />
260            List of lists where each list contains the indices of 1s.
261
262        Example:
263
264            >>> one_set = {0, 1, 2, 3, 5, 6} \n
265            >>> MatrixPatches.split_one_set(one_set)
266            [[0, 1, 2, 3], [5, 6]]
267        """
268
269        assert isinstance(
270            one_set, set | list
271        ), "one_set must be a set or a list"
272
273        sub_sets = []
274
275        if isinstance(one_set, list):
276            # need to remove duplicates if any
277            one_set = set(one_set)
278
279        one_set = sorted(list(one_set))
280
281        for idx, one_pos in enumerate(one_set):
282
283            curr_pos = one_pos
284            prev_pos = one_set[idx - 1] if idx > 0 else None
285
286            if idx == 0:
287                # If it's the first position, create a new sub-set
288                sub_sets.append([curr_pos])
289
290            elif curr_pos - prev_pos == 1:
291                # If the current position is consecutive to the previous one
292                # add it to the last sub-set
293                sub_sets[-1].append(one_pos)
294
295            else:
296                # If the current position is not consecutive to the previous one
297                # create a new sub-set
298                sub_sets.append([curr_pos])
299
300        return sub_sets
301
302    @staticmethod
303    def one_sets_to_df(
304        one_sets: dict,
305        columns: list
306    ):
307        """Convert a dictionary to a pandas DataFrame.
308
309        Arguments:
310
311        - **one_sets (dict)**:<br />
312            Dictionary to convert.
313
314        - **columns (list)**:<br />
315            Column names.
316
317        Returns:
318
319        - df (pd.DataFrame)**:<br />
320            DataFrame with the dictionary keys as first column and values
321            as second column in columns.
322
323        Example:
324
325            >>> one_sets = {
326            ... 1: [{1, 2}, {5}],
327            ... 2: [{4, 5}, {6}]
328            ... }
329            >>> columns = ["A", "B"]
330            >>> MatrixPatches.one_sets_to_df(one_sets, columns)
331               A       B
332            0  1  {1, 2}
333            1  1     {5}
334            2  2  {4, 5}
335            3  2     {6}
336        """
337
338        if all([isinstance(val, list) for val in one_sets.values()]):
339
340            df_rows = []
341
342            for k, v in one_sets.items():
343                for val in v:
344                    df_rows.append([str(k), val])
345
346            df = pd.DataFrame(df_rows, columns=columns)
347
348        else:
349            raise ValueError("All values in the dictionary must be lists.")
350
351        return df
352
353    @staticmethod
354    def aggregate_df_rows(
355        df: pd.DataFrame,
356        groupby_col: str,
357        agg_col: str
358    ):
359        """Group a DataFrame by a column and aggregate another column.
360
361        Arguments:
362
363        - **df (pd.DataFrame)**:<br />
364            DataFrame with groupby_col and agg_col.
365
366        - **groupby_col (str)**:<br />
367            Column to group by (each value is a set).
368
369        - **agg_col (str)**:<br />
370            Column to aggregate (each value is a string).
371
372        Returns:
373
374        - **df_group (pd.DataFrame)**:<br />
375            Grouped DataFrame with both columns as a set.
376
377        Example:
378
379            >>> df = pd.DataFrame({
380            ... "A": ["1", "1", "1", "2", "3", "4"],
381            ... "B": [{1}, {1,2}, {5}, {4,5}, {1,2}, {1,2}]
382            ... })
383            >>> MatrixPatches.aggregate_df_rows(df, "B", "A")
384                    B          A
385            0     {1}        {1}
386            1  {1, 2}  {1, 3, 4}
387            2  {4, 5}        {2}
388            3     {5}        {1}
389        """
390
391        df_group = (
392            df.groupby(df[groupby_col].map(tuple))[agg_col]
393            .apply(",".join)
394            .reset_index()
395        )
396        df_group[agg_col] = df_group[agg_col].astype(object)
397        for idx, row in df_group.iterrows():
398            one_set = row[agg_col].split(",")
399            one_set = [int(x) for x in one_set]
400            one_set = sorted(one_set)
401            df_group.at[idx, agg_col] = set(one_set)
402
403        df_group[groupby_col] = df_group[groupby_col].apply(set)
404
405        return df_group
406
407    @staticmethod
408    def combine_dfs(
409        df1: pd.DataFrame,
410        df2: pd.DataFrame,
411        colname_1: str,
412        colname_2: str
413    ):
414        """Combine two DataFrames with columns colname_1 and colname_2
415        into a new DataFrame with interacting residues ranges without duplicates.
416
417        Arguments:
418
419        - **df1 (pd.DataFrame)**:<br />
420            DataFrame 1.
421
422        - **df2 (pd.DataFrame)**:<br />
423            DataFrame 2.
424
425        - **colname_1 (str)**:<br />
426            Column name 1.
427
428        - **colname_2 (str)**:<br />
429            Column name 2.
430
431        Returns:
432
433        - **new_df (pd.DataFrame)**:<br />
434            Combined DataFrame of interacting residues ranges without
435            duplicates.
436
437        Example:
438
439            >>> df1 = pd.DataFrame({
440            ... "A":[{1, 3, 4}, {1}, {1, 2}, {0, 1, 2, 4}],
441            ... "B":[{1}, {1, 2, 3}, {2, 3}, {3}]
442            ... })
443            >>> df2 = pd.DataFrame({
444            ... "A": [{0, 1, 2}, {1}, {1, 2}, {3, 4}, {4}],
445            ... "B": [{3}, {1, 2, 3}, {2, 3}, {1}, {1, 3}]
446            ... })
447
448            >>> MatrixPatches.combine_dfs(df1, df2, "A", "B")
449                 A    B
450            0  0-2    3
451            1    1  1-3
452            2  1-2  2-3
453            3  3-4    1
454            4    4    1
455            5    4    3
456            6    1    1
457        """
458
459        combined_df = pd.concat([df2, df1], axis=0)
460        combined_df.reset_index(drop=True, inplace=True)
461
462        df_rows = []
463
464        for _, row in combined_df.iterrows():
465            if isinstance(row[colname_1], set) and isinstance(
466                row[colname_2], set
467            ):
468                ranges1 = get_key_from_res_range(row[colname_1], as_list=True)
469                ranges2 = get_key_from_res_range(row[colname_2], as_list=True)
470                assert isinstance(ranges1, list) and isinstance(ranges2, list)
471                for res_range1 in ranges1:
472                    for res_range2 in ranges2:
473                        df_rows.append([res_range1, res_range2])
474
475        new_df = pd.DataFrame(df_rows, columns=[colname_1, colname_2])
476        new_df.drop_duplicates(inplace=True, keep=MiscStrEnum.FIRST)
477        new_df.reset_index(drop=True, inplace=True)
478
479        return new_df
480
481    @staticmethod
482    def remove_subset_rows(
483        df: pd.DataFrame,
484        colname_1: str,
485        colname_2: str
486    ):
487        """Remove rows that are subsets of other rows.
488        (from chatgpt)
489
490        Arguments:
491
492        - **df (pd.DataFrame)**:<br />
493            DataFrame with columns `colname_1` and `colname_2`.
494
495        - **colname_1 (str)**:<br />
496            column name 1.
497
498        - **colname_2 (str)**:<br />
499            column name 2.
500
501        Returns:
502
503        - **filtered_df (pd.DataFrame)**:<br />
504            DataFrame with subset rows removed.
505
506        Example:
507
508        >>> df = pd.DataFrame({
509        ... "A": [{0, 1, 2}, {1}, {1, 2}, {3, 4}, {4}, {4}, {1}],
510        ... "B": [{3}, {1, 2, 3}, {2, 3}, {1}, {1}, {3}, {1}]
511        ... })
512        >>> MatrixPatches.remove_subset_rows(df, "A", "B")
513                   A          B
514        0  {0, 1, 2}        {3}
515        1        {1}  {1, 2, 3}
516        2     {1, 2}     {2, 3}
517        3     {3, 4}        {1}
518        4        {4}        {3}
519        """
520
521        rows_to_keep = []
522
523        for i, row in df.iterrows():
524
525            if not any(
526                MatrixPatches.is_subset(
527                    row, df.iloc[j], colname_1, colname_2
528                )
529                for j in range(len(df))
530                if i != j
531            ):
532                rows_to_keep.append(i)
533
534        filtered_df = df.loc[rows_to_keep].reset_index(drop=True)
535
536        return filtered_df
537
538    @staticmethod
539    def is_subset(
540        row: pd.Series,
541        other_row: pd.Series,
542        colname_1: str,
543        colname_2: str,
544    ):
545        """Check if row is a subset of other_row for two specified columns.
546
547        Arguments:
548
549        - **row (pd.Series)**:<br />
550            Row to check if it is a subset of other_row.
551
552        - **other_row (pd.Series)**:<br />
553            Row to check against.
554
555        - **colname_1 (str)**:<br />
556            Column name 1.
557
558        - **colname_2 (str)**:<br />
559            Column name 2.
560
561        Returns:
562
563        - **(bool)**:<br />
564            `True` if row is a subset of `other_row`, `False` otherwise.
565
566        Example:
567
568            >>> row = pd.Series({"A": {0, 1, 2}, "B": {3}})
569            >>> other_row = pd.Series({"A": {0, 1, 2, 3}, "B": {3}})
570            >>> MatrixPatches.is_subset(row, other_row, "A", "B")
571            True
572            >>> other_row = pd.Series({"A": {1, 2}, "B": {3}})
573            >>> MatrixPatches.is_subset(row, other_row, "A", "B")
574            False
575        """
576
577        return (
578            row[colname_1].issubset(other_row[colname_1])
579            and row[colname_2].issubset(other_row[colname_2])
580        )
581
582
if __name__ == "__main__":

    # When executed as a script, run the examples embedded in the
    # docstrings of this module as doctests.
    import doctest
    doctest.testmod()
class MatrixPatches:
 18class MatrixPatches:
 19    """Class to get interacting patches from a binary matrix"""
 20
 21    matrix: np.ndarray
 22    """ Binary matrix where rows and columns represent different objects
 23    (e.g., chains in a protein complex). """
 24
 25    row_obj: str
 26    """ Identifier for the rows in the matrix. """
 27
 28    col_obj: str
 29    """ Identifier for the columns in the matrix. """
 30
 31    def __init__(
 32        self,
 33        matrix: np.ndarray,
 34        row_obj: str = "row_obj",
 35        col_obj: str = "col_obj",
 36    ):
 37        self.matrix = matrix
 38        self.row_obj = row_obj
 39        self.col_obj = col_obj
 40
 41    def get_patches_from_matrix(self):
 42        """Get all interacting patches from a binary matrix
 43
 44        Arguments:
 45
 46        - **matrix (np.ndarray)**:<br />
 47            Binary matrix where rows and columns represent different objects
 48            (e.g., chains in a protein complex).
 49
 50        - **row_obj (str)**:<br />
 51            Identifier for the rows in the matrix
 52
 53        - **col_obj (str)**:<br />
 54            Identifier for the columns in the matrix
 55
 56        Returns:
 57
 58        - **patches (dict)**:<br />
 59            Dictionary of interacting patches
 60
 61        Example:
 62
 63            >>> matrix = np.array([
 64            ... [0, 0, 0, 1],
 65            ... [0, 1, 1, 1],
 66            ... [0, 0, 1, 1],
 67            ... [0, 1, 0, 0],
 68            ... [0, 1, 0, 1]
 69            ... ])
 70            >>> matrix_patches = MatrixPatches(
 71            ... matrix, row_obj="A", col_obj="B"
 72            ... )
 73            >>> matrix_patches.get_patches_from_matrix()
 74                       A          B
 75            0  {0, 1, 2}        {3}
 76            1        {1}  {1, 2, 3}
 77            2     {1, 2}     {2, 3}
 78            3     {3, 4}        {1}
 79            4        {4}        {3}
 80        """
 81
 82        assert np.isin(self.matrix, [0, 1]).all() and np.any(self.matrix), (
 83            f"Matrix must be binary and non-empty, got {np.unique(self.matrix)}"
 84        )
 85
 86        row_sets = self.get_one_sets_from_matrix(self.matrix, axis=0)
 87        col_sets = self.get_one_sets_from_matrix(self.matrix, axis=1)
 88
 89        split_row_sets = self.extend_one_sets_by_subsets(row_sets)
 90        split_col_sets = self.extend_one_sets_by_subsets(col_sets)
 91
 92        df_row = self.one_sets_to_df(
 93            split_row_sets, [self.row_obj, self.col_obj]
 94        )
 95        df_col = self.one_sets_to_df(
 96            split_col_sets, [self.col_obj, self.row_obj]
 97        )
 98
 99        df_row = self.aggregate_df_rows(df_row, self.col_obj, self.row_obj)
100        df_col = self.aggregate_df_rows(df_col, self.row_obj, self.col_obj)
101
102        combined_df = self.combine_dfs(
103            df_row, df_col, self.row_obj, self.col_obj
104        )
105
106        for col in [self.row_obj, self.col_obj]:
107            combined_df[col] = combined_df[col].apply(
108                get_res_range_from_key, return_type="set"
109            )
110
111        combined_df = self.remove_subset_rows(
112            combined_df, self.row_obj, self.col_obj
113        )
114
115        return combined_df
116
117    @staticmethod
118    def get_one_sets_from_matrix(matrix: np.ndarray, axis: int = 0):
119        """Get the indices of 1s in a binary matrix rowwise or columnwise.
120
121        Arguments:
122
123        - **matrix (np.ndarray)**:<br />
124            Binary matrix where rows and columns represent different objects
125            (e.g., chains in a protein complex).
126
127        - **axis (int, optional)**:<br />
128            0 for rowwise, 1 for columnwise.
129
130        Returns:
131
132        - **one_sets (dict)**:<br />
133            `{k:v}` where `v` is a set of indices of 1s for key `k`.
134
135        Example:
136
137            >>> matrix = np.array([
138            ... [1, 0, 1],
139            ... [0, 1, 0],
140            ... [1, 1, 0]
141            ... ])
142            >>> MatrixPatches.get_one_sets_from_matrix(matrix=matrix, axis=0)
143            {0: {np.int64(0), np.int64(2)}, 1: {np.int64(1)}, 2: {np.int64(0), np.int64(1)}}
144            >>> MatrixPatches.get_one_sets_from_matrix(matrix=matrix, axis=1)
145            {0: {np.int64(0), np.int64(2)}, 1: {np.int64(1), np.int64(2)}, 2: {np.int64(0)}}
146        """
147
148        assert np.isin(matrix, [0, 1]).all() and np.any(matrix), (
149            f"Matrix must be binary and non-empty, got {np.unique(matrix)}"
150        )
151
152        one_sets = {}
153
154        if axis == 0:  # row_sets
155            for i in range(matrix.shape[0]):
156                one_sets[i] = set(np.where(matrix[i] == 1)[0])
157
158        elif axis == 1:  # col_sets
159            for j in range(matrix.shape[1]):
160                one_sets[j] = set(np.where(matrix[:, j] == 1)[0])
161
162        return one_sets
163
164    @staticmethod
165    def extend_one_sets_by_subsets(one_sets: dict) -> dict:
166        """Add the subsets of the sets in list_of_sets to the one_sets.
167
168        Arguments:
169
170        - **one_sets (dict)**:<br />
171            `{k:v}` where `v` is a set of indices of 1s for key `k`.
172
173        Returns:
174
175        - **new_one_sets (dict)**:<br />
176            `{k:v}` where `v` is a list of sets of indices of 1s for key `k`
177            each set is a subset of the original set and is present in
178            the values of `one_sets`.
179
180        Example:
181
182            >>> one_sets = {
183            ... 0: {0, 1, 2, 3, 5, 6},
184            ... 1: {1},
185            ... 2: {0, 1}
186            ... }
187            >>> MatrixPatches.extend_one_sets_by_subsets(one_sets)
188            {0: [{0, 1, 2, 3}, {5, 6}, {1}, {0, 1}], 1: [{1}], 2: [{1}, {0, 1}]}
189        """
190
191        split_sets = MatrixPatches.split_one_sets(one_sets)
192
193        new_one_sets = defaultdict(list)
194        list_of_sets = []  # unique sets from split_sets
195        list_of_sets = [
196            set(x)
197            for xs in split_sets.values()
198            for x in xs
199            if set(x) not in list_of_sets
200        ]
201
202        for set1 in list_of_sets:
203            for idx, one_set in one_sets.items():
204                if set1.issubset(one_set):
205                    (
206                        new_one_sets[idx].append(set1)
207                        if set1 not in new_one_sets[idx]
208                        else None
209                    )
210
211        return dict(new_one_sets)
212
213    @staticmethod
214    def split_one_sets(one_sets: dict) -> dict:
215        """Split the sets in `one_sets` into sub-sets such that
216        each subset only contains consecutive indices.
217
218        Arguments:
219
220        - **one_sets (dict)**:<br />
221            `{k:v}` where `v` is a set of indices of 1s for key `k`.
222
223        Returns:
224
225        - **new_one_sets (dict)**:<br />
226            dictionary of lists of lists where each list contains the
227            indices of 1s.
228
229        Example:
230
231            >>> one_sets = {0: {0, 1, 2, 3, 5, 6}, 1: {1}, 2: {0, 1}}
232            >>> MatrixPatches.split_one_sets(one_sets)
233            {0: [[0, 1, 2, 3], [5, 6]], 1: [[1]], 2: [[0, 1]]}
234        """
235
236        new_one_sets = {}
237
238        for i, one_set in one_sets.items():
239
240            if not isinstance(one_set, set):
241                raise TypeError("one_set must be a set")
242
243            sub_sets = MatrixPatches.split_one_set(one_set)
244            new_one_sets[i] = sub_sets
245
246        return new_one_sets
247
248    @staticmethod
249    def split_one_set(one_set: set | list) -> list:
250        """Split a set of indices into sub-sets such that
251        each subset only contains consecutive indices.
252
253        Arguments:
254
255        - **one_set (set | list)**:<br />
256            Set of indices of 1s.
257
258        Returns:
259
260        - **sub_sets (list)**:<br />
261            List of lists where each list contains the indices of 1s.
262
263        Example:
264
265            >>> one_set = {0, 1, 2, 3, 5, 6} \n
266            >>> MatrixPatches.split_one_set(one_set)
267            [[0, 1, 2, 3], [5, 6]]
268        """
269
270        assert isinstance(
271            one_set, set | list
272        ), "one_set must be a set or a list"
273
274        sub_sets = []
275
276        if isinstance(one_set, list):
277            # need to remove duplicates if any
278            one_set = set(one_set)
279
280        one_set = sorted(list(one_set))
281
282        for idx, one_pos in enumerate(one_set):
283
284            curr_pos = one_pos
285            prev_pos = one_set[idx - 1] if idx > 0 else None
286
287            if idx == 0:
288                # If it's the first position, create a new sub-set
289                sub_sets.append([curr_pos])
290
291            elif curr_pos - prev_pos == 1:
292                # If the current position is consecutive to the previous one
293                # add it to the last sub-set
294                sub_sets[-1].append(one_pos)
295
296            else:
297                # If the current position is not consecutive to the previous one
298                # create a new sub-set
299                sub_sets.append([curr_pos])
300
301        return sub_sets
302
303    @staticmethod
304    def one_sets_to_df(
305        one_sets: dict,
306        columns: list
307    ):
308        """Convert a dictionary to a pandas DataFrame.
309
310        Arguments:
311
312        - **one_sets (dict)**:<br />
313            Dictionary to convert.
314
315        - **columns (list)**:<br />
316            Column names.
317
318        Returns:
319
320        - df (pd.DataFrame)**:<br />
321            DataFrame with the dictionary keys as first column and values
322            as second column in columns.
323
324        Example:
325
326            >>> one_sets = {
327            ... 1: [{1, 2}, {5}],
328            ... 2: [{4, 5}, {6}]
329            ... }
330            >>> columns = ["A", "B"]
331            >>> MatrixPatches.one_sets_to_df(one_sets, columns)
332               A       B
333            0  1  {1, 2}
334            1  1     {5}
335            2  2  {4, 5}
336            3  2     {6}
337        """
338
339        if all([isinstance(val, list) for val in one_sets.values()]):
340
341            df_rows = []
342
343            for k, v in one_sets.items():
344                for val in v:
345                    df_rows.append([str(k), val])
346
347            df = pd.DataFrame(df_rows, columns=columns)
348
349        else:
350            raise ValueError("All values in the dictionary must be lists.")
351
352        return df
353
354    @staticmethod
355    def aggregate_df_rows(
356        df: pd.DataFrame,
357        groupby_col: str,
358        agg_col: str
359    ):
360        """Group a DataFrame by a column and aggregate another column.
361
362        Arguments:
363
364        - **df (pd.DataFrame)**:<br />
365            DataFrame with groupby_col and agg_col.
366
367        - **groupby_col (str)**:<br />
368            Column to group by (each value is a set).
369
370        - **agg_col (str)**:<br />
371            Column to aggregate (each value is a string).
372
373        Returns:
374
375        - **df_group (pd.DataFrame)**:<br />
376            Grouped DataFrame with both columns as a set.
377
378        Example:
379
380            >>> df = pd.DataFrame({
381            ... "A": ["1", "1", "1", "2", "3", "4"],
382            ... "B": [{1}, {1,2}, {5}, {4,5}, {1,2}, {1,2}]
383            ... })
384            >>> MatrixPatches.aggregate_df_rows(df, "B", "A")
385                    B          A
386            0     {1}        {1}
387            1  {1, 2}  {1, 3, 4}
388            2  {4, 5}        {2}
389            3     {5}        {1}
390        """
391
392        df_group = (
393            df.groupby(df[groupby_col].map(tuple))[agg_col]
394            .apply(",".join)
395            .reset_index()
396        )
397        df_group[agg_col] = df_group[agg_col].astype(object)
398        for idx, row in df_group.iterrows():
399            one_set = row[agg_col].split(",")
400            one_set = [int(x) for x in one_set]
401            one_set = sorted(one_set)
402            df_group.at[idx, agg_col] = set(one_set)
403
404        df_group[groupby_col] = df_group[groupby_col].apply(set)
405
406        return df_group
407
408    @staticmethod
409    def combine_dfs(
410        df1: pd.DataFrame,
411        df2: pd.DataFrame,
412        colname_1: str,
413        colname_2: str
414    ):
415        """Combine two DataFrames with columns colname_1 and colname_2
416        into a new DataFrame with interacting residues ranges without duplicates.
417
418        Arguments:
419
420        - **df1 (pd.DataFrame)**:<br />
421            DataFrame 1.
422
423        - **df2 (pd.DataFrame)**:<br />
424            DataFrame 2.
425
426        - **colname_1 (str)**:<br />
427            Column name 1.
428
429        - **colname_2 (str)**:<br />
430            Column name 2.
431
432        Returns:
433
434        - **new_df (pd.DataFrame)**:<br />
435            Combined DataFrame of interacting residues ranges without
436            duplicates.
437
438        Example:
439
440            >>> df1 = pd.DataFrame({
441            ... "A":[{1, 3, 4}, {1}, {1, 2}, {0, 1, 2, 4}],
442            ... "B":[{1}, {1, 2, 3}, {2, 3}, {3}]
443            ... })
444            >>> df2 = pd.DataFrame({
445            ... "A": [{0, 1, 2}, {1}, {1, 2}, {3, 4}, {4}],
446            ... "B": [{3}, {1, 2, 3}, {2, 3}, {1}, {1, 3}]
447            ... })
448
449            >>> MatrixPatches.combine_dfs(df1, df2, "A", "B")
450                 A    B
451            0  0-2    3
452            1    1  1-3
453            2  1-2  2-3
454            3  3-4    1
455            4    4    1
456            5    4    3
457            6    1    1
458        """
459
460        combined_df = pd.concat([df2, df1], axis=0)
461        combined_df.reset_index(drop=True, inplace=True)
462
463        df_rows = []
464
465        for _, row in combined_df.iterrows():
466            if isinstance(row[colname_1], set) and isinstance(
467                row[colname_2], set
468            ):
469                ranges1 = get_key_from_res_range(row[colname_1], as_list=True)
470                ranges2 = get_key_from_res_range(row[colname_2], as_list=True)
471                assert isinstance(ranges1, list) and isinstance(ranges2, list)
472                for res_range1 in ranges1:
473                    for res_range2 in ranges2:
474                        df_rows.append([res_range1, res_range2])
475
476        new_df = pd.DataFrame(df_rows, columns=[colname_1, colname_2])
477        new_df.drop_duplicates(inplace=True, keep=MiscStrEnum.FIRST)
478        new_df.reset_index(drop=True, inplace=True)
479
480        return new_df
481
482    @staticmethod
483    def remove_subset_rows(
484        df: pd.DataFrame,
485        colname_1: str,
486        colname_2: str
487    ):
488        """Remove rows that are subsets of other rows.
489        (from chatgpt)
490
491        Arguments:
492
493        - **df (pd.DataFrame)**:<br />
494            DataFrame with columns `colname_1` and `colname_2`.
495
496        - **colname_1 (str)**:<br />
497            column name 1.
498
499        - **colname_2 (str)**:<br />
500            column name 2.
501
502        Returns:
503
504        - **filtered_df (pd.DataFrame)**:<br />
505            DataFrame with subset rows removed.
506
507        Example:
508
509        >>> df = pd.DataFrame({
510        ... "A": [{0, 1, 2}, {1}, {1, 2}, {3, 4}, {4}, {4}, {1}],
511        ... "B": [{3}, {1, 2, 3}, {2, 3}, {1}, {1}, {3}, {1}]
512        ... })
513        >>> MatrixPatches.remove_subset_rows(df, "A", "B")
514                   A          B
515        0  {0, 1, 2}        {3}
516        1        {1}  {1, 2, 3}
517        2     {1, 2}     {2, 3}
518        3     {3, 4}        {1}
519        4        {4}        {3}
520        """
521
522        rows_to_keep = []
523
524        for i, row in df.iterrows():
525
526            if not any(
527                MatrixPatches.is_subset(
528                    row, df.iloc[j], colname_1, colname_2
529                )
530                for j in range(len(df))
531                if i != j
532            ):
533                rows_to_keep.append(i)
534
535        filtered_df = df.loc[rows_to_keep].reset_index(drop=True)
536
537        return filtered_df
538
539    @staticmethod
540    def is_subset(
541        row: pd.Series,
542        other_row: pd.Series,
543        colname_1: str,
544        colname_2: str,
545    ):
546        """Check if row is a subset of other_row for two specified columns.
547
548        Arguments:
549
550        - **row (pd.Series)**:<br />
551            Row to check if it is a subset of other_row.
552
553        - **other_row (pd.Series)**:<br />
554            Row to check against.
555
556        - **colname_1 (str)**:<br />
557            Column name 1.
558
559        - **colname_2 (str)**:<br />
560            Column name 2.
561
562        Returns:
563
564        - **(bool)**:<br />
565            `True` if row is a subset of `other_row`, `False` otherwise.
566
567        Example:
568
569            >>> row = pd.Series({"A": {0, 1, 2}, "B": {3}})
570            >>> other_row = pd.Series({"A": {0, 1, 2, 3}, "B": {3}})
571            >>> MatrixPatches.is_subset(row, other_row, "A", "B")
572            True
573            >>> other_row = pd.Series({"A": {1, 2}, "B": {3}})
574            >>> MatrixPatches.is_subset(row, other_row, "A", "B")
575            False
576        """
577
578        return (
579            row[colname_1].issubset(other_row[colname_1])
580            and row[colname_2].issubset(other_row[colname_2])
581        )

Class to get interacting patches from a binary matrix

MatrixPatches( matrix: numpy.ndarray, row_obj: str = 'row_obj', col_obj: str = 'col_obj')
31    def __init__(
32        self,
33        matrix: np.ndarray,
34        row_obj: str = "row_obj",
35        col_obj: str = "col_obj",
36    ):
37        self.matrix = matrix
38        self.row_obj = row_obj
39        self.col_obj = col_obj
matrix: numpy.ndarray

Binary matrix where rows and columns represent different objects (e.g., chains in a protein complex).

row_obj: str

Identifier for the rows in the matrix.

col_obj: str

Identifier for the columns in the matrix.

def get_patches_from_matrix(self):
 41    def get_patches_from_matrix(self):
 42        """Get all interacting patches from a binary matrix
 43
 44        Arguments:
 45
 46        - **matrix (np.ndarray)**:<br />
 47            Binary matrix where rows and columns represent different objects
 48            (e.g., chains in a protein complex).
 49
 50        - **row_obj (str)**:<br />
 51            Identifier for the rows in the matrix
 52
 53        - **col_obj (str)**:<br />
 54            Identifier for the columns in the matrix
 55
 56        Returns:
 57
 58        - **patches (dict)**:<br />
 59            Dictionary of interacting patches
 60
 61        Example:
 62
 63            >>> matrix = np.array([
 64            ... [0, 0, 0, 1],
 65            ... [0, 1, 1, 1],
 66            ... [0, 0, 1, 1],
 67            ... [0, 1, 0, 0],
 68            ... [0, 1, 0, 1]
 69            ... ])
 70            >>> matrix_patches = MatrixPatches(
 71            ... matrix, row_obj="A", col_obj="B"
 72            ... )
 73            >>> matrix_patches.get_patches_from_matrix()
 74                       A          B
 75            0  {0, 1, 2}        {3}
 76            1        {1}  {1, 2, 3}
 77            2     {1, 2}     {2, 3}
 78            3     {3, 4}        {1}
 79            4        {4}        {3}
 80        """
 81
 82        assert np.isin(self.matrix, [0, 1]).all() and np.any(self.matrix), (
 83            f"Matrix must be binary and non-empty, got {np.unique(self.matrix)}"
 84        )
 85
 86        row_sets = self.get_one_sets_from_matrix(self.matrix, axis=0)
 87        col_sets = self.get_one_sets_from_matrix(self.matrix, axis=1)
 88
 89        split_row_sets = self.extend_one_sets_by_subsets(row_sets)
 90        split_col_sets = self.extend_one_sets_by_subsets(col_sets)
 91
 92        df_row = self.one_sets_to_df(
 93            split_row_sets, [self.row_obj, self.col_obj]
 94        )
 95        df_col = self.one_sets_to_df(
 96            split_col_sets, [self.col_obj, self.row_obj]
 97        )
 98
 99        df_row = self.aggregate_df_rows(df_row, self.col_obj, self.row_obj)
100        df_col = self.aggregate_df_rows(df_col, self.row_obj, self.col_obj)
101
102        combined_df = self.combine_dfs(
103            df_row, df_col, self.row_obj, self.col_obj
104        )
105
106        for col in [self.row_obj, self.col_obj]:
107            combined_df[col] = combined_df[col].apply(
108                get_res_range_from_key, return_type="set"
109            )
110
111        combined_df = self.remove_subset_rows(
112            combined_df, self.row_obj, self.col_obj
113        )
114
115        return combined_df

Get all interacting patches from a binary matrix

Arguments (the method takes no parameters; these are the instance attributes it reads):

  • matrix (np.ndarray):
    Binary matrix where rows and columns represent different objects (e.g., chains in a protein complex).

  • row_obj (str):
    Identifier for the rows in the matrix

  • col_obj (str):
    Identifier for the columns in the matrix

Returns:

  • patches (pd.DataFrame):
    DataFrame of interacting patches with one column per object (row_obj and col_obj); each cell holds a set of indices.

Example:

>>> matrix = np.array([
... [0, 0, 0, 1],
... [0, 1, 1, 1],
... [0, 0, 1, 1],
... [0, 1, 0, 0],
... [0, 1, 0, 1]
... ])
>>> matrix_patches = MatrixPatches(
... matrix, row_obj="A", col_obj="B"
... )
>>> matrix_patches.get_patches_from_matrix()
           A          B
0  {0, 1, 2}        {3}
1        {1}  {1, 2, 3}
2     {1, 2}     {2, 3}
3     {3, 4}        {1}
4        {4}        {3}
@staticmethod
def get_one_sets_from_matrix(matrix: numpy.ndarray, axis: int = 0):
117    @staticmethod
118    def get_one_sets_from_matrix(matrix: np.ndarray, axis: int = 0):
119        """Get the indices of 1s in a binary matrix rowwise or columnwise.
120
121        Arguments:
122
123        - **matrix (np.ndarray)**:<br />
124            Binary matrix where rows and columns represent different objects
125            (e.g., chains in a protein complex).
126
127        - **axis (int, optional)**:<br />
128            0 for rowwise, 1 for columnwise.
129
130        Returns:
131
132        - **one_sets (dict)**:<br />
133            `{k:v}` where `v` is a set of indices of 1s for key `k`.
134
135        Example:
136
137            >>> matrix = np.array([
138            ... [1, 0, 1],
139            ... [0, 1, 0],
140            ... [1, 1, 0]
141            ... ])
142            >>> MatrixPatches.get_one_sets_from_matrix(matrix=matrix, axis=0)
143            {0: {np.int64(0), np.int64(2)}, 1: {np.int64(1)}, 2: {np.int64(0), np.int64(1)}}
144            >>> MatrixPatches.get_one_sets_from_matrix(matrix=matrix, axis=1)
145            {0: {np.int64(0), np.int64(2)}, 1: {np.int64(1), np.int64(2)}, 2: {np.int64(0)}}
146        """
147
148        assert np.isin(matrix, [0, 1]).all() and np.any(matrix), (
149            f"Matrix must be binary and non-empty, got {np.unique(matrix)}"
150        )
151
152        one_sets = {}
153
154        if axis == 0:  # row_sets
155            for i in range(matrix.shape[0]):
156                one_sets[i] = set(np.where(matrix[i] == 1)[0])
157
158        elif axis == 1:  # col_sets
159            for j in range(matrix.shape[1]):
160                one_sets[j] = set(np.where(matrix[:, j] == 1)[0])
161
162        return one_sets

Get the indices of 1s in a binary matrix rowwise or columnwise.

Arguments:

  • matrix (np.ndarray):
    Binary matrix where rows and columns represent different objects (e.g., chains in a protein complex).

  • axis (int, optional):
    0 for rowwise, 1 for columnwise.

Returns:

  • one_sets (dict):
    {k:v} where v is a set of indices of 1s for key k.

Example:

>>> matrix = np.array([
... [1, 0, 1],
... [0, 1, 0],
... [1, 1, 0]
... ])
>>> MatrixPatches.get_one_sets_from_matrix(matrix=matrix, axis=0)
{0: {np.int64(0), np.int64(2)}, 1: {np.int64(1)}, 2: {np.int64(0), np.int64(1)}}
>>> MatrixPatches.get_one_sets_from_matrix(matrix=matrix, axis=1)
{0: {np.int64(0), np.int64(2)}, 1: {np.int64(1), np.int64(2)}, 2: {np.int64(0)}}
@staticmethod
def extend_one_sets_by_subsets(one_sets: dict) -> dict:
164    @staticmethod
165    def extend_one_sets_by_subsets(one_sets: dict) -> dict:
166        """Add the subsets of the sets in list_of_sets to the one_sets.
167
168        Arguments:
169
170        - **one_sets (dict)**:<br />
171            `{k:v}` where `v` is a set of indices of 1s for key `k`.
172
173        Returns:
174
175        - **new_one_sets (dict)**:<br />
176            `{k:v}` where `v` is a list of sets of indices of 1s for key `k`
177            each set is a subset of the original set and is present in
178            the values of `one_sets`.
179
180        Example:
181
182            >>> one_sets = {
183            ... 0: {0, 1, 2, 3, 5, 6},
184            ... 1: {1},
185            ... 2: {0, 1}
186            ... }
187            >>> MatrixPatches.extend_one_sets_by_subsets(one_sets)
188            {0: [{0, 1, 2, 3}, {5, 6}, {1}, {0, 1}], 1: [{1}], 2: [{1}, {0, 1}]}
189        """
190
191        split_sets = MatrixPatches.split_one_sets(one_sets)
192
193        new_one_sets = defaultdict(list)
194        list_of_sets = []  # unique sets from split_sets
195        list_of_sets = [
196            set(x)
197            for xs in split_sets.values()
198            for x in xs
199            if set(x) not in list_of_sets
200        ]
201
202        for set1 in list_of_sets:
203            for idx, one_set in one_sets.items():
204                if set1.issubset(one_set):
205                    (
206                        new_one_sets[idx].append(set1)
207                        if set1 not in new_one_sets[idx]
208                        else None
209                    )
210
211        return dict(new_one_sets)

For every key, collect each run of consecutive indices (taken from any set in one_sets) that is a subset of that key's set.

Arguments:

  • one_sets (dict):
    {k:v} where v is a set of indices of 1s for key k.

Returns:

  • new_one_sets (dict):
    {k:v} where v is a list of sets of indices of 1s for key k; each set is a subset of the original set and is present in the values of one_sets.

Example:

>>> one_sets = {
... 0: {0, 1, 2, 3, 5, 6},
... 1: {1},
... 2: {0, 1}
... }
>>> MatrixPatches.extend_one_sets_by_subsets(one_sets)
{0: [{0, 1, 2, 3}, {5, 6}, {1}, {0, 1}], 1: [{1}], 2: [{1}, {0, 1}]}
@staticmethod
def split_one_sets(one_sets: dict) -> dict:
213    @staticmethod
214    def split_one_sets(one_sets: dict) -> dict:
215        """Split the sets in `one_sets` into sub-sets such that
216        each subset only contains consecutive indices.
217
218        Arguments:
219
220        - **one_sets (dict)**:<br />
221            `{k:v}` where `v` is a set of indices of 1s for key `k`.
222
223        Returns:
224
225        - **new_one_sets (dict)**:<br />
226            dictionary of lists of lists where each list contains the
227            indices of 1s.
228
229        Example:
230
231            >>> one_sets = {0: {0, 1, 2, 3, 5, 6}, 1: {1}, 2: {0, 1}}
232            >>> MatrixPatches.split_one_sets(one_sets)
233            {0: [[0, 1, 2, 3], [5, 6]], 1: [[1]], 2: [[0, 1]]}
234        """
235
236        new_one_sets = {}
237
238        for i, one_set in one_sets.items():
239
240            if not isinstance(one_set, set):
241                raise TypeError("one_set must be a set")
242
243            sub_sets = MatrixPatches.split_one_set(one_set)
244            new_one_sets[i] = sub_sets
245
246        return new_one_sets

Split the sets in one_sets into sub-sets such that each subset only contains consecutive indices.

Arguments:

  • one_sets (dict):
    {k:v} where v is a set of indices of 1s for key k.

Returns:

  • new_one_sets (dict):
    dictionary of lists of lists where each list contains the indices of 1s.

Example:

>>> one_sets = {0: {0, 1, 2, 3, 5, 6}, 1: {1}, 2: {0, 1}}
>>> MatrixPatches.split_one_sets(one_sets)
{0: [[0, 1, 2, 3], [5, 6]], 1: [[1]], 2: [[0, 1]]}
@staticmethod
def split_one_set(one_set: set | list) -> list:
248    @staticmethod
249    def split_one_set(one_set: set | list) -> list:
250        """Split a set of indices into sub-sets such that
251        each subset only contains consecutive indices.
252
253        Arguments:
254
255        - **one_set (set | list)**:<br />
256            Set of indices of 1s.
257
258        Returns:
259
260        - **sub_sets (list)**:<br />
261            List of lists where each list contains the indices of 1s.
262
263        Example:
264
265            >>> one_set = {0, 1, 2, 3, 5, 6} \n
266            >>> MatrixPatches.split_one_set(one_set)
267            [[0, 1, 2, 3], [5, 6]]
268        """
269
270        assert isinstance(
271            one_set, set | list
272        ), "one_set must be a set or a list"
273
274        sub_sets = []
275
276        if isinstance(one_set, list):
277            # need to remove duplicates if any
278            one_set = set(one_set)
279
280        one_set = sorted(list(one_set))
281
282        for idx, one_pos in enumerate(one_set):
283
284            curr_pos = one_pos
285            prev_pos = one_set[idx - 1] if idx > 0 else None
286
287            if idx == 0:
288                # If it's the first position, create a new sub-set
289                sub_sets.append([curr_pos])
290
291            elif curr_pos - prev_pos == 1:
292                # If the current position is consecutive to the previous one
293                # add it to the last sub-set
294                sub_sets[-1].append(one_pos)
295
296            else:
297                # If the current position is not consecutive to the previous one
298                # create a new sub-set
299                sub_sets.append([curr_pos])
300
301        return sub_sets

Split a set of indices into sub-sets such that each subset only contains consecutive indices.

Arguments:

  • one_set (set | list):
    Set of indices of 1s.

Returns:

  • sub_sets (list):
    List of lists where each list contains the indices of 1s.

Example:

>>> one_set = {0, 1, 2, 3, 5, 6} 

>>> MatrixPatches.split_one_set(one_set)
[[0, 1, 2, 3], [5, 6]]
@staticmethod
def one_sets_to_df(one_sets: dict, columns: list):
303    @staticmethod
304    def one_sets_to_df(
305        one_sets: dict,
306        columns: list
307    ):
308        """Convert a dictionary to a pandas DataFrame.
309
310        Arguments:
311
312        - **one_sets (dict)**:<br />
313            Dictionary to convert.
314
315        - **columns (list)**:<br />
316            Column names.
317
318        Returns:
319
320        - df (pd.DataFrame)**:<br />
321            DataFrame with the dictionary keys as first column and values
322            as second column in columns.
323
324        Example:
325
326            >>> one_sets = {
327            ... 1: [{1, 2}, {5}],
328            ... 2: [{4, 5}, {6}]
329            ... }
330            >>> columns = ["A", "B"]
331            >>> MatrixPatches.one_sets_to_df(one_sets, columns)
332               A       B
333            0  1  {1, 2}
334            1  1     {5}
335            2  2  {4, 5}
336            3  2     {6}
337        """
338
339        if all([isinstance(val, list) for val in one_sets.values()]):
340
341            df_rows = []
342
343            for k, v in one_sets.items():
344                for val in v:
345                    df_rows.append([str(k), val])
346
347            df = pd.DataFrame(df_rows, columns=columns)
348
349        else:
350            raise ValueError("All values in the dictionary must be lists.")
351
352        return df

Convert a dictionary to a pandas DataFrame.

Arguments:

  • one_sets (dict):
    Dictionary to convert.

  • columns (list):
    Column names.

Returns:

  • df (pd.DataFrame):
    DataFrame with the dictionary keys as first column and values as second column in columns.

Example:

>>> one_sets = {
... 1: [{1, 2}, {5}],
... 2: [{4, 5}, {6}]
... }
>>> columns = ["A", "B"]
>>> MatrixPatches.one_sets_to_df(one_sets, columns)
   A       B
0  1  {1, 2}
1  1     {5}
2  2  {4, 5}
3  2     {6}
@staticmethod
def aggregate_df_rows(df: pandas.DataFrame, groupby_col: str, agg_col: str):
354    @staticmethod
355    def aggregate_df_rows(
356        df: pd.DataFrame,
357        groupby_col: str,
358        agg_col: str
359    ):
360        """Group a DataFrame by a column and aggregate another column.
361
362        Arguments:
363
364        - **df (pd.DataFrame)**:<br />
365            DataFrame with groupby_col and agg_col.
366
367        - **groupby_col (str)**:<br />
368            Column to group by (each value is a set).
369
370        - **agg_col (str)**:<br />
371            Column to aggregate (each value is a string).
372
373        Returns:
374
375        - **df_group (pd.DataFrame)**:<br />
376            Grouped DataFrame with both columns as a set.
377
378        Example:
379
380            >>> df = pd.DataFrame({
381            ... "A": ["1", "1", "1", "2", "3", "4"],
382            ... "B": [{1}, {1,2}, {5}, {4,5}, {1,2}, {1,2}]
383            ... })
384            >>> MatrixPatches.aggregate_df_rows(df, "B", "A")
385                    B          A
386            0     {1}        {1}
387            1  {1, 2}  {1, 3, 4}
388            2  {4, 5}        {2}
389            3     {5}        {1}
390        """
391
392        df_group = (
393            df.groupby(df[groupby_col].map(tuple))[agg_col]
394            .apply(",".join)
395            .reset_index()
396        )
397        df_group[agg_col] = df_group[agg_col].astype(object)
398        for idx, row in df_group.iterrows():
399            one_set = row[agg_col].split(",")
400            one_set = [int(x) for x in one_set]
401            one_set = sorted(one_set)
402            df_group.at[idx, agg_col] = set(one_set)
403
404        df_group[groupby_col] = df_group[groupby_col].apply(set)
405
406        return df_group

Group a DataFrame by a column and aggregate another column.

Arguments:

  • df (pd.DataFrame):
    DataFrame with groupby_col and agg_col.

  • groupby_col (str):
    Column to group by (each value is a set).

  • agg_col (str):
    Column to aggregate (each value is a string).

Returns:

  • df_group (pd.DataFrame):
    Grouped DataFrame with both columns as a set.

Example:

>>> df = pd.DataFrame({
... "A": ["1", "1", "1", "2", "3", "4"],
... "B": [{1}, {1,2}, {5}, {4,5}, {1,2}, {1,2}]
... })
>>> MatrixPatches.aggregate_df_rows(df, "B", "A")
        B          A
0     {1}        {1}
1  {1, 2}  {1, 3, 4}
2  {4, 5}        {2}
3     {5}        {1}
@staticmethod
def combine_dfs( df1: pandas.DataFrame, df2: pandas.DataFrame, colname_1: str, colname_2: str):
408    @staticmethod
409    def combine_dfs(
410        df1: pd.DataFrame,
411        df2: pd.DataFrame,
412        colname_1: str,
413        colname_2: str
414    ):
415        """Combine two DataFrames with columns colname_1 and colname_2
416        into a new DataFrame with interacting residues ranges without duplicates.
417
418        Arguments:
419
420        - **df1 (pd.DataFrame)**:<br />
421            DataFrame 1.
422
423        - **df2 (pd.DataFrame)**:<br />
424            DataFrame 2.
425
426        - **colname_1 (str)**:<br />
427            Column name 1.
428
429        - **colname_2 (str)**:<br />
430            Column name 2.
431
432        Returns:
433
434        - **new_df (pd.DataFrame)**:<br />
435            Combined DataFrame of interacting residues ranges without
436            duplicates.
437
438        Example:
439
440            >>> df1 = pd.DataFrame({
441            ... "A":[{1, 3, 4}, {1}, {1, 2}, {0, 1, 2, 4}],
442            ... "B":[{1}, {1, 2, 3}, {2, 3}, {3}]
443            ... })
444            >>> df2 = pd.DataFrame({
445            ... "A": [{0, 1, 2}, {1}, {1, 2}, {3, 4}, {4}],
446            ... "B": [{3}, {1, 2, 3}, {2, 3}, {1}, {1, 3}]
447            ... })
448
449            >>> MatrixPatches.combine_dfs(df1, df2, "A", "B")
450                 A    B
451            0  0-2    3
452            1    1  1-3
453            2  1-2  2-3
454            3  3-4    1
455            4    4    1
456            5    4    3
457            6    1    1
458        """
459
460        combined_df = pd.concat([df2, df1], axis=0)
461        combined_df.reset_index(drop=True, inplace=True)
462
463        df_rows = []
464
465        for _, row in combined_df.iterrows():
466            if isinstance(row[colname_1], set) and isinstance(
467                row[colname_2], set
468            ):
469                ranges1 = get_key_from_res_range(row[colname_1], as_list=True)
470                ranges2 = get_key_from_res_range(row[colname_2], as_list=True)
471                assert isinstance(ranges1, list) and isinstance(ranges2, list)
472                for res_range1 in ranges1:
473                    for res_range2 in ranges2:
474                        df_rows.append([res_range1, res_range2])
475
476        new_df = pd.DataFrame(df_rows, columns=[colname_1, colname_2])
477        new_df.drop_duplicates(inplace=True, keep=MiscStrEnum.FIRST)
478        new_df.reset_index(drop=True, inplace=True)
479
480        return new_df

Combine two DataFrames with columns colname_1 and colname_2 into a new DataFrame with interacting residues ranges without duplicates.

Arguments:

  • df1 (pd.DataFrame):
    DataFrame 1.

  • df2 (pd.DataFrame):
    DataFrame 2.

  • colname_1 (str):
    Column name 1.

  • colname_2 (str):
    Column name 2.

Returns:

  • new_df (pd.DataFrame):
    Combined DataFrame of interacting residues ranges without duplicates.

Example:

>>> df1 = pd.DataFrame({
... "A":[{1, 3, 4}, {1}, {1, 2}, {0, 1, 2, 4}],
... "B":[{1}, {1, 2, 3}, {2, 3}, {3}]
... })
>>> df2 = pd.DataFrame({
... "A": [{0, 1, 2}, {1}, {1, 2}, {3, 4}, {4}],
... "B": [{3}, {1, 2, 3}, {2, 3}, {1}, {1, 3}]
... })

>>> MatrixPatches.combine_dfs(df1, df2, "A", "B")
     A    B
0  0-2    3
1    1  1-3
2  1-2  2-3
3  3-4    1
4    4    1
5    4    3
6    1    1
@staticmethod
def remove_subset_rows(df: pandas.DataFrame, colname_1: str, colname_2: str):
482    @staticmethod
483    def remove_subset_rows(
484        df: pd.DataFrame,
485        colname_1: str,
486        colname_2: str
487    ):
488        """Remove rows that are subsets of other rows.
489        (from chatgpt)
490
491        Arguments:
492
493        - **df (pd.DataFrame)**:<br />
494            DataFrame with columns `colname_1` and `colname_2`.
495
496        - **colname_1 (str)**:<br />
497            column name 1.
498
499        - **colname_2 (str)**:<br />
500            column name 2.
501
502        Returns:
503
504        - **filtered_df (pd.DataFrame)**:<br />
505            DataFrame with subset rows removed.
506
507        Example:
508
509        >>> df = pd.DataFrame({
510        ... "A": [{0, 1, 2}, {1}, {1, 2}, {3, 4}, {4}, {4}, {1}],
511        ... "B": [{3}, {1, 2, 3}, {2, 3}, {1}, {1}, {3}, {1}]
512        ... })
513        >>> MatrixPatches.remove_subset_rows(df, "A", "B")
514                   A          B
515        0  {0, 1, 2}        {3}
516        1        {1}  {1, 2, 3}
517        2     {1, 2}     {2, 3}
518        3     {3, 4}        {1}
519        4        {4}        {3}
520        """
521
522        rows_to_keep = []
523
524        for i, row in df.iterrows():
525
526            if not any(
527                MatrixPatches.is_subset(
528                    row, df.iloc[j], colname_1, colname_2
529                )
530                for j in range(len(df))
531                if i != j
532            ):
533                rows_to_keep.append(i)
534
535        filtered_df = df.loc[rows_to_keep].reset_index(drop=True)
536
537        return filtered_df

Remove rows that are subsets of other rows. (from chatgpt)

Arguments:

  • df (pd.DataFrame):
    DataFrame with columns colname_1 and colname_2.

  • colname_1 (str):
    column name 1.

  • colname_2 (str):
    column name 2.

Returns:

  • filtered_df (pd.DataFrame):
    DataFrame with subset rows removed.

Example:

>>> df = pd.DataFrame({
... "A": [{0, 1, 2}, {1}, {1, 2}, {3, 4}, {4}, {4}, {1}],
... "B": [{3}, {1, 2, 3}, {2, 3}, {1}, {1}, {3}, {1}]
... })
>>> MatrixPatches.remove_subset_rows(df, "A", "B")
           A          B
0  {0, 1, 2}        {3}
1        {1}  {1, 2, 3}
2     {1, 2}     {2, 3}
3     {3, 4}        {1}
4        {4}        {3}
@staticmethod
def is_subset( row: pandas.Series, other_row: pandas.Series, colname_1: str, colname_2: str):
539    @staticmethod
540    def is_subset(
541        row: pd.Series,
542        other_row: pd.Series,
543        colname_1: str,
544        colname_2: str,
545    ):
546        """Check if row is a subset of other_row for two specified columns.
547
548        Arguments:
549
550        - **row (pd.Series)**:<br />
551            Row to check if it is a subset of other_row.
552
553        - **other_row (pd.Series)**:<br />
554            Row to check against.
555
556        - **colname_1 (str)**:<br />
557            Column name 1.
558
559        - **colname_2 (str)**:<br />
560            Column name 2.
561
562        Returns:
563
564        - **(bool)**:<br />
565            `True` if row is a subset of `other_row`, `False` otherwise.
566
567        Example:
568
569            >>> row = pd.Series({"A": {0, 1, 2}, "B": {3}})
570            >>> other_row = pd.Series({"A": {0, 1, 2, 3}, "B": {3}})
571            >>> MatrixPatches.is_subset(row, other_row, "A", "B")
572            True
573            >>> other_row = pd.Series({"A": {1, 2}, "B": {3}})
574            >>> MatrixPatches.is_subset(row, other_row, "A", "B")
575            False
576        """
577
578        return (
579            row[colname_1].issubset(other_row[colname_1])
580            and row[colname_2].issubset(other_row[colname_2])
581        )

Check if row is a subset of other_row for two specified columns.

Arguments:

  • row (pd.Series):
    Row to check if it is a subset of other_row.

  • other_row (pd.Series):
    Row to check against.

  • colname_1 (str):
    Column name 1.

  • colname_2 (str):
    Column name 2.

Returns:

  • (bool):
    True if row is a subset of other_row, False otherwise.

Example:

>>> row = pd.Series({"A": {0, 1, 2}, "B": {3}})
>>> other_row = pd.Series({"A": {0, 1, 2, 3}, "B": {3}})
>>> MatrixPatches.is_subset(row, other_row, "A", "B")
True
>>> other_row = pd.Series({"A": {1, 2}, "B": {3}})
>>> MatrixPatches.is_subset(row, other_row, "A", "B")
False