aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorEugeniy E. Mikhailov <evgmik@gmail.com>2024-05-28 00:26:23 -0400
committerEugeniy E. Mikhailov <evgmik@gmail.com>2024-05-28 00:26:23 -0400
commitec39d0f666a91e73306bba7d809db52747cb6836 (patch)
treee751750f711e16a302e9d689aa3281cf5f771b1a
parentf66c475ed89ab125cff573ffa692e3d29a596fd5 (diff)
downloadqolab-ec39d0f666a91e73306bba7d809db52747cb6836.tar.gz
qolab-ec39d0f666a91e73306bba7d809db52747cb6836.zip
added some preliminary files for table flow
-rw-r--r--qolab/tableflow/__init__.py63
-rw-r--r--tests/tableflow_test_data/tableIn1.csv7
-rw-r--r--tests/test_tableflow.py50
3 files changed, 120 insertions, 0 deletions
diff --git a/qolab/tableflow/__init__.py b/qolab/tableflow/__init__.py
new file mode 100644
index 0000000..a83120e
--- /dev/null
+++ b/qolab/tableflow/__init__.py
@@ -0,0 +1,63 @@
+"""
+Provide basic method to process data describing tables
+Created by Eugeniy E. Mikhailov 2024/05/27
+
+The basic idea is that we have an *input* table
+with data description, and we (re)generate an *output* table
+based on the input table with processed rows.
+
+If the output table already has processed rows with entries different from NA,
+such rows are skipped.
+
+Super handy for bulk processing data files where only a few parameters changed.
+"""
+
+import pandas as pd
+
def loadInOutTables(inputFileName=None, outputFileName=None, comment=None):
    """Load the input table and the matching output (results) table.

    Parameters
    ----------
    inputFileName : str or path-like, optional
        CSV file with the input table. When it is not given (``None`` or
        empty) nothing is loaded.
    outputFileName : str or path-like, optional
        CSV file with a previously generated output table.
    comment : str, optional
        Character which starts a comment in the input file,
        ``'#'`` is used when not given.

    Returns
    -------
    tuple
        ``(tIn, tOut)`` data frames, or ``(None, None)`` when no input file
        name is given. ``tOut`` is a deep copy of ``tIn`` when the output
        file is not given or cannot be read.

    Raises
    ------
    Whatever ``pandas.read_csv`` raises for the *input* file; problems with
    the output file are not propagated.
    """
    if not inputFileName:
        return None, None

    if not comment:
        comment = "#"

    tIn = pd.read_csv(inputFileName, comment=comment)
    # clean up leading white space in columns names
    tIn.columns = tIn.columns.str.lstrip()

    tOut = None
    if outputFileName:
        try:
            tOut = pd.read_csv(outputFileName)
        except (OSError, ValueError):
            # Best effort: a missing, empty or unparsable output file
            # (pandas parser errors derive from ValueError) means that
            # we start over from the input table.
            tOut = None
    if tOut is None:
        tOut = tIn.copy(deep=True)

    return tIn, tOut
+
def ilocRowOrAdd(tbl, row):
    """Find a row of ``tbl`` similar to ``row``; insert ``row`` if none exists.

    Only the columns named by the keys of ``row`` are compared. An entry
    counts as a hit when the values are equal or when it is NA in both
    the table and ``row``.

    Parameters
    ----------
    tbl : pandas.DataFrame
        Table to search; it is modified in place when a row is added.
    row : pandas.Series
        Entries to look for, keyed by column name.

    Returns
    -------
    Index label of the first similar row, or the label of the newly added
    row. With the default index the new label equals the old ``len(tbl)``.
    """
    tSub = tbl[row.keys()]
    res = (tSub == row) | (tSub.isna() & row.isna())
    res = res.all(axis=1)  # which rows coincide
    if res.any():
        # we have a similar row
        return res[res].index[0]

    # we need to create a new row since tbl does not have it
    i = len(tbl)
    # With a non default index (e.g. after rows were dropped) the label
    # len(tbl) may already be taken; writing to it would overwrite
    # an existing row instead of adding a new one.
    while i in tbl.index:
        i += 1
    updateTblRowAt(tbl, i, row)
    return i
+
def updateTblRowAt(tbl, i, row):
    """Copy every entry of ``row`` into the row of ``tbl`` labelled ``i``.

    The table is modified in place, one entry at a time, via ``tbl.at``;
    nothing is returned.
    """
    for column in row.keys():
        value = row[column]
        tbl.at[i, column] = value
+
def isRedoNeeded(row, cols2check):
    """Tell whether ``row`` has to be processed again.

    Redo is required when any column listed in ``cols2check`` is missing
    from ``row``, or when all the entries of ``row`` in these columns are NA.
    """
    available = row.keys()
    if any(col not in available for col in cols2check):
        # at least one of the required columns is missing
        return True
    # all required columns are present: redo only if none of them is filled
    return bool(row[cols2check].isna().all())
+
diff --git a/tests/tableflow_test_data/tableIn1.csv b/tests/tableflow_test_data/tableIn1.csv
new file mode 100644
index 0000000..ba61273
--- /dev/null
+++ b/tests/tableflow_test_data/tableIn1.csv
@@ -0,0 +1,7 @@
+# this is comment line1
+# this is comment line2
+x,y,z
+1,2,3
+2,3,4
+4,5,6
+
diff --git a/tests/test_tableflow.py b/tests/test_tableflow.py
new file mode 100644
index 0000000..6b16046
--- /dev/null
+++ b/tests/test_tableflow.py
@@ -0,0 +1,50 @@
+import pytest
+import qolab.tableflow as tblfl
+import pandas as pd
+
def test_noinputs():
    """Without an input file name no tables are loaded at all."""
    no_tables = (None, None)
    assert tblfl.loadInOutTables() == no_tables
    # the output file name alone does not trigger any loading
    loaded = tblfl.loadInOutTables(inputFileName=None, outputFileName="non_existing_file")
    assert loaded == no_tables
+
def test_wrong_comment():
    """Loading the input file with a wrong comment character must fail.

    With '%' as the comment character the '#' lines of the test file are
    read as data, so the number of fields per line is inconsistent.
    """
    # expect the specific parser failure rather than just any exception
    with pytest.raises(pd.errors.ParserError):
        tblfl.loadInOutTables(
            inputFileName='tests/tableflow_test_data/tableIn1.csv',
            outputFileName=None,
            comment='%',
        )
+
def test_right_comment():
    """The input file is loaded as a data frame with the right comment character."""
    tIn, _ = tblfl.loadInOutTables(
        inputFileName='tests/tableflow_test_data/tableIn1.csv',
        outputFileName=None,
        comment='#',
    )
    assert isinstance(tIn, pd.DataFrame)
+
def test_output_defaults_to_input_copy():
    """Without an output file the output table is an independent copy of the input."""
    tIn, tOut = tblfl.loadInOutTables(
        inputFileName='tests/tableflow_test_data/tableIn1.csv',
        outputFileName=None,
        comment='#',
    )
    assert isinstance(tOut, pd.DataFrame)
    assert tOut is not tIn
    assert tOut.equals(tIn)
+
+
def test_for_existing_row():
    """A row which is already in the table is reported at its index."""
    table = pd.DataFrame({'a': [1, 2, 3], 'b': [1, 4, 6]})
    wanted = pd.Series({'a': 2, 'b': 4})
    found_at = tblfl.ilocRowOrAdd(table, wanted)
    assert found_at == 1
+
def test_for_existing_row_with_NA():
    """NA entries match only NA entries when looking for a similar row."""
    # NA in both table and row should return a hit
    tbl1 = pd.DataFrame({'a': [1, 2, 3], 'b': [1, pd.NA, 6]})
    r = pd.Series({'a': 2, 'b': pd.NA})
    assert tblfl.ilocRowOrAdd(tbl1, r) == 1
    assert len(tbl1) == 3  # a hit must not add rows

    # NA in the row only is not a hit: should insert new row
    tbl1 = pd.DataFrame({'a': [1, 2, 3], 'b': [1, 4, 6]})
    r = pd.Series({'a': 2, 'b': pd.NA})
    assert tblfl.ilocRowOrAdd(tbl1, r) == 3
    assert len(tbl1) == 4  # the table has grown by the inserted row
+
def test_for_nonexisting_row_and_its_insertion():
    """A row which is not in the table is appended to it."""
    table = pd.DataFrame({'a': [1, 2, 3], 'b': [1, 4, 6]})
    new_row = pd.Series({'a': 2, 'b': 10})
    rows_before = len(table)
    assert rows_before == 3
    inserted_at = tblfl.ilocRowOrAdd(table, new_row)
    assert inserted_at == 3
    rows_after = len(table)
    assert rows_after == 4
+