qolab/tableflow/__init__.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100

"""
Provide basic methods to process data-describing tables
Created by Eugeniy E. Mikhailov 2024/05/27

The basic idea is that we have an *input* table
with data descriptions, and we (re)generate an *output* table
based on the input table with processed rows.

If the output table already has processed rows, i.e. rows with entries different from NA,
such rows are skipped.

Super handy for bulk processing data files where only a few parameters changed.
"""

import pandas as pd
import warnings

def loadInOutTables(inputFileName=None, outputFileName=None, comment=None):
    """Load the input table and, when it can be read, the output table too.

    - inputFileName is the CSV source of the input table;
      if it is empty or None, (None, None) is returned
    - outputFileName is the CSV source of the previously generated output table;
      if it is not given or cannot be read, the output table is a deep copy
      of the input table
    - comment is the comment character passed to pandas.read_csv ('#' if not set)

    Leading white space is removed from the column names of both tables,
    so that headers written as "a, b, c" give columns 'a', 'b', 'c'.

    Returns the tuple (tIn, tOut).
    """
    if not inputFileName:
        return None, None

    if not comment:
        comment = '#'

    tIn = pd.read_csv(inputFileName, comment=comment)
    # lstrip() removes all leading white space, not just a single space
    tIn.columns = tIn.columns.str.lstrip()

    tOut = None
    if outputFileName:
        try:
            tOut = pd.read_csv(outputFileName, comment=comment)
        except FileNotFoundError:
            # expected on the first run: there is no output table yet
            pass
        except Exception as err:
            # keep the best-effort fallback, but do not hide the reason
            warnings.warn(
                f"cannot read output table {outputFileName!r} ({err}), "
                "using a copy of the input table instead"
            )

    if tOut is None:
        tOut = tIn.copy(deep=True)
    else:
        # keep the output column names consistent with the cleaned input ones
        tOut.columns = tOut.columns.str.lstrip()

    return tIn, tOut

def ilocRowOrAdd(tbl, row):
    """Return the index of the first row of 'tbl' similar to 'row', adding 'row' if there is none.

    Only the columns named in 'row' are compared.
    Entries which are NA in both the table and 'row' count as a match.
    If no similar row exists, 'row' is stored in 'tbl' at index len(tbl)
    and that index is returned.
    """
    candidates = tbl[row.keys()]
    sameValue = candidates == row
    bothMissing = candidates.isna() & row.isna()
    rowMatches = (sameValue | bothMissing).all(axis=1)  # True where every compared column coincides

    if not rowMatches.any():
        # tbl has no such row yet, so store it after the existing ones
        newIndex = len(tbl)
        updateTblRowAt(tbl, newIndex, row)
        return newIndex

    # index of the first similar row
    return rowMatches[rowMatches].index[0]

def updateTblRowAt(tbl, i, row):
    """Copy every entry of 'row' into the row of the table ('tbl') addressed by 'i'.

    Each key of 'row' names the column to write; values are assigned via 'tbl.at'.
    """
    for column, value in row.items():
        tbl.at[i, column] = value

def isRedoNeeded(row, cols2check):
    """Tell whether the given 'row' has to be (re)processed.

    Redo is needed if at least one column listed in 'cols2check'
    is missing from 'row', or if *all* 'cols2check' entries of 'row' are NA.
    """
    available = row.keys()
    if any(col not in available for col in cols2check):
        return True
    # every requested column exists: redo only when none of them holds a value
    return bool(row[cols2check].isna().all())

def reflowTable(tIn, tOut, process_row_func=None, postProcessedColums=None, extraInfo=None, redo=False):
    """Reflow/update tOut in place based on the inputs specified in tIn.

    Effectively maps unprocessed rows to 'process_row_func'
    - tIn is the input table, tOut is the output table (modified in place)
    - postProcessedColums is a list of column names which need to be generated
    - extraInfo is a dictionary of additional parameters supplied to process_row_func
    - process_row_func is expected to behave like:
      rowOut = process_row_func(rowIn, extraInfo=userInfo)
    - redo controls if reflow is needed unconditionally (i.e. force reflow)

    If process_row_func, postProcessedColums, or one of the tables is not
    provided, a warning is issued and nothing is done.
    Returns None.
    """
    if not process_row_func:
        warnings.warn("process_row_func is not provided, exiting reflowTable")
        return
    if not postProcessedColums:
        warnings.warn("postProcessedColums are not provided, exiting reflowTable")
        return
    if tIn is None or tOut is None:
        # e.g. loadInOutTables returns (None, None) when no input file is given
        warnings.warn("input or output table is not provided, exiting reflowTable")
        return

    for _, rowIn in tIn.iterrows():
        iOut = ilocRowOrAdd(tOut, rowIn)
        # ilocRowOrAdd returns an index label and updateTblRowAt writes by label
        # (via .at), so the row must be read back by label as well
        rowOutBefore = tOut.loc[iOut]

        if not (redo or isRedoNeeded(rowOutBefore, postProcessedColums)):
            continue

        # processing data describing row
        rowOut = process_row_func(rowOutBefore, extraInfo=extraInfo)
        updateTblRowAt(tOut, iOut, rowOut)