added some preliminary files for table flow

author: Eugeniy E. Mikhailov <evgmik@gmail.com> 2024-05-28 00:26:23 -0400
committer: Eugeniy E. Mikhailov <evgmik@gmail.com> 2024-05-28 00:26:23 -0400
commit: ec39d0f666a91e73306bba7d809db52747cb6836 (patch)
tree: e751750f711e16a302e9d689aa3281cf5f771b1a
parent: f66c475ed89ab125cff573ffa692e3d29a596fd5 (diff)
download: qolab-ec39d0f666a91e73306bba7d809db52747cb6836.tar.gz
qolab-ec39d0f666a91e73306bba7d809db52747cb6836.zip
3 files changed, 120 insertions, 0 deletions
diff --git a/qolab/tableflow/__init__.py b/qolab/tableflow/__init__.py
new file mode 100644
index 0000000..a83120e
--- /dev/null
+++ b/qolab/tableflow/__init__.py
@@ -0,0 +1,63 @@
+"""
+Provide basic method to process data describing tables
+Created by Eugeniy E. Mikhailov 2024/05/27
+
+The basic idea is that we have an *input* table
+with data description and we (re)generate an *output* table
+based on the input table with processed rows.
+
+If the output table already has processed rows with entries different from NA,
+such rows are skipped.
+
+Super handy for bulk processing data files where only a few parameters changed.
+"""
+
+import pandas as pd
+
+def loadInOutTables(inputFileName=None, outputFileName=None, comment=None):
+    """
+    Load the input table and the matching output table from CSV files.
+
+    Parameters
+    ----------
+    inputFileName : str or path-like, optional
+        CSV file with the input table. If it is not set, nothing is loaded.
+    outputFileName : str or path-like, optional
+        CSV file with the previously generated output table.
+    comment : str, optional
+        Character which starts a comment in the CSV files, '#' if not set.
+
+    Returns
+    -------
+    tuple
+        ``(tIn, tOut)`` as pandas DataFrames, or ``(None, None)`` if
+        inputFileName is not set. If the output file is not specified,
+        cannot be opened, or is empty, tOut is a deep copy of tIn.
+    """
+    if not inputFileName:
+        return None, None
+
+    if not comment:
+        comment = '#'
+
+    tIn = pd.read_csv(inputFileName, comment=comment)
+    # clean up leading white space in columns names
+    tIn.columns = tIn.columns.str.lstrip()
+
+    if outputFileName:
+        try:
+            return tIn, pd.read_csv(outputFileName, comment=comment)
+        except (OSError, pd.errors.EmptyDataError):
+            # there is no usable output yet, we start from the input table
+            pass
+
+    return tIn, tIn.copy(deep=True)
+
+def ilocRowOrAdd(tbl, row):
+    """
+    Return the index of the first row of 'tbl' similar to 'row'.
+
+    Only the columns listed in 'row' are compared; an entry counts as
+    matching when the values are equal or when both of them are NA.
+    If there is no similar row, 'row' is written into 'tbl' at the
+    index len(tbl) and this index is returned.
+    """
+    view = tbl[row.keys()]
+    sameValue = view == row
+    bothNA = view.isna() & row.isna()
+    hits = (sameValue | bothNA).all(axis=1)  # True for rows which coincide
+    if not hits.any():
+        # tbl does not have such a row, so we place it after the last one
+        newIdx = len(tbl)
+        updateTblRowAt(tbl, newIdx, row)
+        return newIdx
+    # report the first of the similar rows
+    return hits[hits].index[0]
+
+def updateTblRowAt(tbl, i, row):
+    """Copy every entry of 'row' into the row 'i' of 'tbl', in place."""
+    for colName in row.keys():
+        tbl.at[i, colName] = row[colName]
+
+def isRedoNeeded(row, cols2check):
+    """
+    Tell whether 'row' has to be processed again.
+
+    Redo is needed if any column from cols2check is missing in 'row',
+    or if all entries of 'row' in cols2check are NA.
+    """
+    knownCols = row.keys()
+    if any(c not in knownCols for c in cols2check):
+        return True
+    return bool(row[cols2check].isna().all())
+
diff --git a/tests/tableflow_test_data/tableIn1.csv b/tests/tableflow_test_data/tableIn1.csv
new file mode 100644
index 0000000..ba61273
--- /dev/null
+++ b/tests/tableflow_test_data/tableIn1.csv
@@ -0,0 +1,7 @@
+# this is comment line1
+# this is comment line2
+x,y,z
+1,2,3
+2,3,4
+4,5,6
+
diff --git a/tests/test_tableflow.py b/tests/test_tableflow.py
new file mode 100644
index 0000000..6b16046
--- /dev/null
+++ b/tests/test_tableflow.py
@@ -0,0 +1,50 @@
+import pytest
+import qolab.tableflow as tblfl
+import pandas as pd
+
+def test_noinputs():
+    # without an input file name nothing is loaded, whatever the output name is
+    nothing = (None, None)
+    assert tblfl.loadInOutTables() == nothing
+    assert tblfl.loadInOutTables(inputFileName=None, outputFileName="non_existing_file") == nothing
+
+def test_wrong_comment():
+    # with a wrong comment character the '#' lines are parsed as data:
+    # the first one becomes a single column header, so the 'x,y,z' line
+    # has too many fields and the parser must give up
+    with pytest.raises(pd.errors.ParserError):
+        tblfl.loadInOutTables(inputFileName='tests/tableflow_test_data/tableIn1.csv', outputFileName=None, comment='%')
+
+def test_right_comment():
+    # with the right comment character the comment lines are skipped
+    # and the table from the test file is loaded as is
+    tIn,tOut = tblfl.loadInOutTables(inputFileName='tests/tableflow_test_data/tableIn1.csv', outputFileName=None, comment='#')
+    assert isinstance(tIn, pd.DataFrame)
+    assert list(tIn.columns) == ['x', 'y', 'z']
+    assert len(tIn) == 3
+
+
+def test_for_existing_row():
+    # a row which is already in the table is reported by its index
+    table = pd.DataFrame({'a': [1, 2, 3], 'b': [1, 4, 6]})
+    wanted = pd.Series({'a': 2, 'b': 4})
+    found = tblfl.ilocRowOrAdd(table, wanted)
+    assert found == 1
+
+def test_for_existing_row_with_NA():
+    # NA in both table and row should return a hit
+    tbl1 = pd.DataFrame( {'a':[1,2,3], 'b':[1,pd.NA,6]})
+    r = pd.Series({'a':2, 'b':pd.NA})
+    assert tblfl.ilocRowOrAdd(tbl1, r) == 1
+
+    # NA only in the row is not a hit: should insert new row
+    tbl1 = pd.DataFrame( {'a':[1,2,3], 'b':[1,4,6]})
+    r = pd.Series({'a':2, 'b':pd.NA})
+    assert tblfl.ilocRowOrAdd(tbl1, r) == 3
+
+def test_for_nonexisting_row_and_its_insertion():
+    # a row missing from the table is appended and its new index is returned
+    table = pd.DataFrame({'a': [1, 2, 3], 'b': [1, 4, 6]})
+    newRow = pd.Series({'a': 2, 'b': 10})
+    sizeBefore = len(table)
+    assert sizeBefore == 3
+    assert tblfl.ilocRowOrAdd(table, newRow) == 3
+    assert len(table) == 4
+
author	Eugeniy E. Mikhailov <evgmik@gmail.com>	2024-05-28 00:26:23 -0400
committer	Eugeniy E. Mikhailov <evgmik@gmail.com>	2024-05-28 00:26:23 -0400
commit	ec39d0f666a91e73306bba7d809db52747cb6836 (patch)
tree	e751750f711e16a302e9d689aa3281cf5f771b1a
parent	f66c475ed89ab125cff573ffa692e3d29a596fd5 (diff)
download	qolab-ec39d0f666a91e73306bba7d809db52747cb6836.tar.gz qolab-ec39d0f666a91e73306bba7d809db52747cb6836.zip