"""Set of Pytest unit tests for delete_superfluous_records.py This module contains a set of Pytest unit tests for the function delete_superfluous_records() that is contained in erdc.py. """ import pytest import pandas as pd import erdc def test_simple_case(): data_in = { 'CohortID': [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'FirstName': ['J', 'Jona', 'John', 'Jonath', 'Jo', 'Jon','John', 'Jh', 'Jonathon', 'Jhon', 'Jr'] } expected_out = { 'CohortID': [0, 1, 1, 1, 1], 'FirstName': ['J', 'John', 'Jonathon', 'Jhon', 'Jr'] } expected_deleted_out = { 'CohortID': [1, 1, 1, 1, 1, 1], 'FirstName': ['Jona', 'Jonath', 'Jo', 'Jon', 'John', 'Jh'], 'WhyDeleted': ['FirstName', 'FirstName', 'FirstName', 'FirstName', 'DISTINCT', 'FirstName', ] } results_out, results_deleted_out = erdc.delete_superfluous_records(data_in, ['CohortID'], ['FirstName']) # The two dataframes returned by delete_superfluous_records() # maintains the original indexes from the data_set_in input # parameter. To compare with expected_out, need to reset_index. results_out.reset_index(drop=True, inplace=True) results_deleted_out.reset_index(drop=True, inplace=True) assert results_out.equals(pd.DataFrame(expected_out)) assert results_deleted_out.equals(pd.DataFrame(expected_deleted_out)) def test_multiple_static_ids(): data_in = { 'CohortID': ['Coh_47', 'Coh_47', 'Coh_47', 'Coh_10', 'Coh_10', 'Coh_10', 'Coh_10'], 'GeographyCode': [22, 22, 32, 22, 71, 71, 71], 'FirstName': ['Nguyet', 'Nguy', 'Nguyet', 'Nguyet', 'Nguyet', 'Nguyet', 'Nguyet'] } expected_out = { 'CohortID': ['Coh_47', 'Coh_47', 'Coh_10', 'Coh_10'], 'GeographyCode': [22, 32, 22, 71], 'FirstName': ['Nguyet', 'Nguyet', 'Nguyet', 'Nguyet'] } expected_deleted_out = { 'CohortID': ['Coh_47', 'Coh_10', 'Coh_10'], 'GeographyCode': [22, 71, 71], 'FirstName': ['Nguy', 'Nguyet', 'Nguyet'], 'WhyDeleted': ['FirstName', 'DISTINCT', 'DISTINCT'] } results_out, results_deleted_out = erdc.delete_superfluous_records(data_in, ['CohortID', 'GeographyCode'], ['FirstName']) # The two dataframes returned by delete_superfluous_records() # maintains the original indexes from the data_set_in input # parameter. To compare with expected_out, need to reset_index. results_out.reset_index(drop=True, inplace=True) results_deleted_out.reset_index(drop=True, inplace=True) assert results_out.equals(pd.DataFrame(expected_out)) assert results_deleted_out.equals(pd.DataFrame(expected_deleted_out)) def test_names(): data_in = { 'ID': [1, 1, 1, 1, 1, 1, 2, 3, 3, 3], 'FirstName': ['John', 'Jon','John', 'John', 'John', 'John', 'Jill', 'Stuart', 'Stua', 'Stuart'], 'MiddleName': ['Rob', 'R', 'Rybert', 'R', 'Robert', 'R', None, 'Beof', 'Beof', 'B'], 'LastName': ['Smith', 'Smith','Smith','Smith', 'Smith', 'Smith', 'Smith', 'Jones', 'Jones', 'Jones'], 'BirthDate': ['1922-01-02', '1922-01-02','1922-01-02', '1922-01-02', None, '1922-01-02','1944-03-04', '1966-05-06', '1966-05-06', '1966-05-06'], 'Pet': ['Cat', 'Dog', 'Dog', 'Goldfish', 'Aardvark', 'Weasel', 'Cat', 'Cow', 'Dog', 'Venus Flytrap'] } expected_out = { 'ID': [1, 1, 1, 1, 2, 3], 'FirstName': ['John','Jon', 'John', 'John', 'Jill', 'Stuart'], 'MiddleName': ['Rob', 'R', 'Rybert', 'Robert', None, 'Beof'], 'LastName': ['Smith', 'Smith','Smith','Smith', 'Smith', 'Jones'], 'BirthDate': ['1922-01-02', '1922-01-02','1922-01-02', None, '1944-03-04', '1966-05-06'], 'Pet': ['Cat', 'Dog', 'Dog', 'Aardvark', 'Cat', 'Cow'] } expected_deleted_out = { 'ID': [1, 1, 3, 3], 'FirstName': ['John', 'John', 'Stua', 'Stuart'], 'MiddleName': ['R', 'R', 'Beof', 'B'], 'LastName': ['Smith', 'Smith', 'Jones', 'Jones'], 'BirthDate': ['1922-01-02', '1922-01-02', '1966-05-06', '1966-05-06'], 'Pet': ['Goldfish', 'Weasel', 'Dog', 'Venus Flytrap'], 'WhyDeleted': ['MiddleName', 'DISTINCT', 'FirstName', 'MiddleName'] } results_out, results_deleted_out = erdc.delete_superfluous_records(pd.DataFrame(data_in), ["ID"], ["MiddleName", "FirstName", "LastName", "BirthDate"]) # The two dataframes returned by delete_superfluous_records() # maintains the original indexes from the data_set_in input # parameter. To compare with expected_out, need to reset_index. results_out.reset_index(drop=True, inplace=True) results_deleted_out.reset_index(drop=True, inplace=True) assert results_out.equals(pd.DataFrame(expected_out)) assert results_deleted_out.equals(pd.DataFrame(expected_deleted_out))