diff options
-rw-r--r-- | qolab/tableflow/__init__.py | 95 |
-rw-r--r-- | tests/tableflow_test_data/tableIn1.csv | 7 |
-rw-r--r-- | tests/test_tableflow.py | 86 |
3 files changed, 188 insertions, 0 deletions

diff --git a/qolab/tableflow/__init__.py b/qolab/tableflow/__init__.py
new file mode 100644
--- /dev/null
+++ b/qolab/tableflow/__init__.py
@@ -0,0 +1,95 @@
+"""
+Provide basic methods to process data describing tables.
+Created by Eugeniy E. Mikhailov 2024/05/27
+
+The basic idea is that we have an *input* table
+with data description and we (re)generate an *output* table
+based on the input table with processed rows.
+
+If the output table already has processed rows with entries different from NA,
+such rows are skipped.
+
+Super handy for bulk processing data files where only a few parameters changed.
+"""
+
+import pandas as pd
+
+
+def loadInOutTables(inputFileName=None, outputFileName=None, comment=None):
+    """Load the input table and the output table which goes with it.
+
+    Parameters
+    ----------
+    inputFileName : str or path-like, optional
+        CSV file with the input table; nothing is loaded without it.
+    outputFileName : str or path-like, optional
+        CSV file with the previously generated output table.
+    comment : str, optional
+        Character starting a comment in the input file, '#' if not set.
+
+    Returns
+    -------
+    tuple
+        ``(tIn, tOut)`` data frames or ``(None, None)`` if ``inputFileName``
+        is not set.  If the output file is not set or cannot be read,
+        ``tOut`` is a deep copy of ``tIn``.
+    """
+    if not inputFileName:
+        return None, None
+
+    if not comment:
+        comment = "#"
+
+    tIn = pd.read_csv(inputFileName, comment=comment)
+    # clean up leading white space in column names
+    tIn.columns = tIn.columns.str.lstrip()
+
+    tOut = None
+    if outputFileName:
+        try:
+            tOut = pd.read_csv(outputFileName)
+        except (OSError, ValueError):
+            # missing, empty or malformed output file (pandas parser errors
+            # are ValueError subclasses): regenerate it from the input table
+            tOut = None
+    if tOut is None:
+        tOut = tIn.copy(deep=True)
+
+    return tIn, tOut
+
+
+def ilocRowOrAdd(tbl, row):
+    """Return the index label of the row in ``tbl`` similar to ``row``.
+
+    Only the columns listed in ``row`` are compared; NA in both ``tbl``
+    and ``row`` is treated as a hit.  If a similar row is not found,
+    ``row`` is inserted into ``tbl`` at label ``len(tbl)``.
+    """
+    tSub = tbl[row.keys()]
+    res = (tSub == row) | (tSub.isna() & row.isna())
+    res = res.all(axis=1)  # which rows coincide
+    if res.any():
+        # we have a similar row, report the first one
+        i = res[res].index[0]
+    else:
+        # we need to create a new row since tbl does not have it
+        i = len(tbl)
+        updateTblRowAt(tbl, i, row)
+    return i
+
+
+def updateTblRowAt(tbl, i, row):
+    """Copy entries of ``row`` into ``tbl`` at row label ``i``, in place."""
+    for k, v in row.items():
+        tbl.at[i, k] = v
+
+
+def isRedoNeeded(row, cols2check):
+    """Tell whether ``row`` has to be (re)processed.
+
+    Redo is required if all entries of ``row`` in ``cols2check`` are NA
+    or if ``row`` is missing any of the ``cols2check`` columns.
+    """
+    if any(c not in row.keys() for c in cols2check):
+        return True
+    return bool(row[cols2check].isna().all())
diff --git a/tests/tableflow_test_data/tableIn1.csv b/tests/tableflow_test_data/tableIn1.csv
new file mode 100644
index 0000000..ba61273
--- /dev/null
+++ b/tests/tableflow_test_data/tableIn1.csv
@@ -0,0 +1,7 @@
+# this is comment line1
+# this is comment line2
+x,y,z
+1,2,3
+2,3,4
+4,5,6
+
diff --git a/tests/test_tableflow.py b/tests/test_tableflow.py
new file mode 100644
--- /dev/null
+++ b/tests/test_tableflow.py
@@ -0,0 +1,86 @@
+import pandas as pd
+import pytest
+
+import qolab.tableflow as tblfl
+
+IN_TABLE = "tests/tableflow_test_data/tableIn1.csv"
+
+
+def test_noinputs():
+    assert tblfl.loadInOutTables() == (None, None)
+    assert tblfl.loadInOutTables(
+        inputFileName=None, outputFileName="non_existing_file"
+    ) == (None, None)
+
+
+def test_wrong_comment():
+    # '#' lines are parsed as data, so rows have inconsistent field counts
+    with pytest.raises(pd.errors.ParserError):
+        tblfl.loadInOutTables(
+            inputFileName=IN_TABLE, outputFileName=None, comment="%"
+        )
+
+
+def test_right_comment():
+    tIn, tOut = tblfl.loadInOutTables(
+        inputFileName=IN_TABLE, outputFileName=None, comment="#"
+    )
+    assert isinstance(tIn, pd.DataFrame)
+    assert list(tIn.columns) == ["x", "y", "z"]
+    assert len(tIn) == 3
+    # without an output file the output table is a copy of the input one
+    assert tOut is not tIn
+    assert tOut.equals(tIn)
+
+
+def test_existing_output_table_is_loaded(tmp_path):
+    outFile = tmp_path / "tableOut.csv"
+    outFile.write_text("x,y,z,res\n1,2,3,10\n")
+    tIn, tOut = tblfl.loadInOutTables(
+        inputFileName=IN_TABLE, outputFileName=outFile
+    )
+    assert len(tIn) == 3
+    assert list(tOut.columns) == ["x", "y", "z", "res"]
+    assert len(tOut) == 1
+
+
+def test_missing_output_table_falls_back_to_input(tmp_path):
+    missing = tmp_path / "non_existing_file.csv"
+    tIn, tOut = tblfl.loadInOutTables(
+        inputFileName=IN_TABLE, outputFileName=missing
+    )
+    assert tOut.equals(tIn)
+
+
+def test_for_existing_row():
+    tbl1 = pd.DataFrame({"a": [1, 2, 3], "b": [1, 4, 6]})
+    r = pd.Series({"a": 2, "b": 4})
+    assert tblfl.ilocRowOrAdd(tbl1, r) == 1
+
+
+def test_for_existing_row_with_NA():
+    # NA in both table and row should return a hit
+    tbl1 = pd.DataFrame({"a": [1, 2, 3], "b": [1, pd.NA, 6]})
+    r = pd.Series({"a": 2, "b": pd.NA})
+    assert tblfl.ilocRowOrAdd(tbl1, r) == 1
+
+    # NA only in the row is not a hit: should insert new row
+    tbl1 = pd.DataFrame({"a": [1, 2, 3], "b": [1, 4, 6]})
+    r = pd.Series({"a": 2, "b": pd.NA})
+    assert tblfl.ilocRowOrAdd(tbl1, r) == 3
+
+
+def test_for_nonexisting_row_and_its_insertion():
+    tbl1 = pd.DataFrame({"a": [1, 2, 3], "b": [1, 4, 6]})
+    r = pd.Series({"a": 2, "b": 10})
+    assert len(tbl1) == 3
+    assert tblfl.ilocRowOrAdd(tbl1, r) == 3
+    assert len(tbl1) == 4
+
+
+def test_redo_is_needed_for_missing_or_all_NA_columns():
+    row = pd.Series({"a": 1, "b": pd.NA})
+    assert tblfl.isRedoNeeded(row, ["b"])
+    assert tblfl.isRedoNeeded(row, ["c"])
+    assert not tblfl.isRedoNeeded(row, ["a"])
+    assert not tblfl.isRedoNeeded(row, ["a", "b"])