qolab/tableflow/__init__.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146

"""
Provide basic methods to process data describing tables.

Created by Eugeniy E. Mikhailov 2024/05/27

The basic idea is that we have an *input* table
with the data description, and we (re)generate the *output* table
from the input table by processing its rows.

If the output table already has processed rows with entries
different from NA, such rows are skipped.

Super handy for bulk processing of data files where only a few parameters change.
"""

import pandas as pd
import warnings


def loadInOutTables(inputFileName=None, outputFileName=None, comment=None):
    """Load the input and the output tables from files.

    The output table is loaded only if the corresponding file can be read.
    Otherwise it is a deep copy (clone) of the input table.

    Parameters
    ==========
    inputFileName : path or string
        Path to the input table filename. If it is None or empty,
        return None for both tables. The file itself must be readable:
        errors raised by ``pandas.read_csv`` for the input file are not
        caught here.
    outputFileName : path or string or None
        Path to the output table filename. If it is None/empty or such
        file does not exist, clone the input table to the output one.
        If the file exists but cannot be read, a warning is issued and
        the input table is cloned as well.
    comment : string or None (default)
        String which indicates a comment in the input `csv` file.
        Usually it is either '#' or '%'. If set to None, internally changed to '#'.

    Returns
    =======
    tIn, tOut : pair of pandas DataFrames, or (None, None)
        The input table (leading space removed from its column names)
        and the output table.
    """
    if not inputFileName:
        return None, None

    if not comment:
        comment = "#"

    tIn = pd.read_csv(inputFileName, comment=comment)
    # clean up leading white space in columns names
    tIn.columns = tIn.columns.str.removeprefix(" ")

    if not outputFileName:
        # no output file requested: start from a copy of the input
        return tIn, tIn.copy(deep=True)

    try:
        tOut = pd.read_csv(outputFileName, comment=comment)
    except FileNotFoundError:
        # expected situation on the first run: nothing was processed yet
        tOut = tIn.copy(deep=True)
    except Exception as err:
        # keep the best-effort fallback, but do not hide the fact that
        # an existing output file (with earlier results) was not used
        warnings.warn(
            f"cannot read output table {outputFileName!r} ({err!r}), "
            "using a copy of the input table instead",
            stacklevel=2,
        )
        tOut = tIn.copy(deep=True)

    return tIn, tOut


def ilocRowOrAdd(tbl, row):
    """Locate the first row of `tbl` matching `row`; add `row` if absent.

    Only the columns named by ``row.keys()`` are compared. A cell counts
    as matching when the values are equal or when both of them are NA.

    Parameters
    ==========
    tbl : pandas DataFrame
        Table to search. It is modified in place when no match exists.
    row : pandas Series
        Values to look for, keyed by column name.

    Returns
    =======
    The index label of the first matching row. If there is no match,
    ``len(tbl)`` (taken before the insertion), which is also used as
    the label of the newly added row.
    """
    wanted = tbl[row.keys()]
    sameValue = wanted == row
    bothNA = wanted.isna() & row.isna()
    matches = (sameValue | bothNA).all(axis=1)  # one flag per table row

    if not matches.any():
        # tbl has no such row yet, so put it right after the last one
        newLabel = len(tbl)
        updateTblRowAt(tbl, newLabel, row)
        return newLabel

    # at least one similar row exists, report the first of them
    return matches[matches].index[0]


def updateTblRowAt(tbl, i, row):
    """Copy every entry of 'row' into the table ('tbl') row labeled 'i'.

    Each key of 'row' is used as a column name and the value is written
    with ``tbl.at[i, key]``; the table is modified in place and nothing
    is returned.
    """
    for column in row.keys():
        value = row[column]
        tbl.at[i, column] = value


def isRedoNeeded(row, cols2check):
    """Tell whether a given row has to be (re)processed.

    The answer is True when at least one name from 'cols2check' is absent
    from the row keys, or when *all* entries listed in 'cols2check' are NA.
    Otherwise the answer is False.

    Parameters
    ==========
    row: pandas row
        row to perform check on
    cols2check: list of strings
        List of strings with column names which considered as generated outputs.
    """
    # a missing output column means the row was never processed
    if any(name not in row.keys() for name in cols2check):
        return True
    # all columns are present: redo only if none of them holds a value
    return bool(row[cols2check].isna().all())


def reflowTable(
    tIn,
    tOut,
    process_row_func=None,
    postProcessedColums=None,
    extraInfo=None,
    redo=False,
):
    """Reflow/update table tOut in place based on the inputs specified in table tIn.

    Effectively maps unprocessed rows to ``process_row_func``.

    For every row of ``tIn`` the matching row of ``tOut`` is located
    (or added, if ``tOut`` does not have it yet). That row is processed
    when ``redo`` is True or when ``isRedoNeeded`` reports that
    the ``postProcessedColums`` entries are missing or all NA.

    Parameters
    ==========
    tIn : pandas DataFrame
        Input table, it is only read.
    tOut : pandas DataFrame
        Output table, modified in place.
    process_row_func : function
        Function which generates a row with post processed entries (columns).
        It is called with the current row of the *output* table (i.e. the
        input entries plus whatever is already stored in that row).
        Expected to have signature like:
        ``rowOut = process_row_func(rowIn, extraInfo=userInfo)``
        All entries of the returned row are written back to ``tOut``.
    postProcessedColums : list of strings
        List of column names which need to be generated
    extraInfo : dictionary (optional)
        Dictionary of additional parameter supplied to ``process_row_func``
    redo : True or False (default)
        Flag indicating if reflow is needed unconditionally
        (i.e. True forces reflow of all entries).

    Returns
    =======
    None. A warning is issued and nothing is done if ``process_row_func``
    or ``postProcessedColums`` is not provided.
    """
    if not process_row_func:
        warnings.warn("process_row_func is not provided, exiting reflowTable")
        return
    if not postProcessedColums:
        warnings.warn("postProcessedColums are not provided, exiting reflowTable")
        return

    for _, rowIn in tIn.iterrows():
        iOut = ilocRowOrAdd(tOut, rowIn)
        # iOut is an index *label*: it comes from `.index[0]` and rows are
        # written with `.at`, so fetch the row by label as well
        # (same as positional access for the default RangeIndex)
        rowOutBefore = tOut.loc[iOut]

        if not (redo or isRedoNeeded(rowOutBefore, postProcessedColums)):
            continue

        # processing data describing row
        rowOut = process_row_func(rowOutBefore, extraInfo=extraInfo)
        updateTblRowAt(tOut, iOut, rowOut)