@@ -137,8 +137,6 @@ def _make_test_input_csv(tmp_path, t: TestFileDescription) -> list[list[Decimal]
 
 def test_snakemake_pipeline_runs_via_exporter_wrapper(tmp_path: Path):
     # ARRANGE
-    repo_root = Path(__file__).resolve().parents[1]
-    compose_file = repo_root / "docker-compose.yml"
 
     # all fields that need to be de-IDed should contain the string "SECRET" so we can search for it later
     file1 = TestFileDescription(
@@ -194,8 +192,9 @@ def test_snakemake_pipeline_runs_via_exporter_wrapper(tmp_path: Path):
     )
     test_data_files = []
     for f in [file1, file2, file3, file4]:
-        test_data = _make_test_input_csv(tmp_path, f)
-        test_data_files.append((f, test_data))
+        test_data_values = _make_test_input_csv(tmp_path, f)
+        test_data_files.append((f, test_data_values))
+
     expected_hash_summaries = {
         "2025-01-01": [
             {
@@ -228,6 +227,42 @@ def test_snakemake_pipeline_runs_via_exporter_wrapper(tmp_path: Path):
     }
 
     # ACT
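+    # Run the whole pipeline via docker compose (see run_snakemake below)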
+    run_snakemake(tmp_path)
+
+    # ASSERT (data files)
+    for filename, expected_data in test_data_files:
+        original_parquet_path = (
+            tmp_path / "original-parquet" / filename.get_orig_parquet()
+        )
+        pseudon_path = tmp_path / "pseudonymised" / filename.get_pseudon_parquet()
+
+        assert original_parquet_path.exists()
+        assert pseudon_path.exists()
+
+        _compare_original_parquet_to_expected(original_parquet_path, expected_data)
+        _compare_parquets(expected_data, original_parquet_path, pseudon_path)
+
+    # ASSERT (hash summaries)
+    # Hash summaries are one per day, not per input file
+    for datestr, expected_summary in expected_hash_summaries.items():
+        expected_path = tmp_path / "hash-lookups" / f"{datestr}.hashes.json"
+        actual_hash_lookup_data = json.loads(expected_path.read_text())
+        assert isinstance(actual_hash_lookup_data, list)
+        # sort order to match expected
+        actual_hash_lookup_data.sort(key=lambda x: x["csn"])
+        assert expected_summary == actual_hash_lookup_data
+
+    # check no extraneous files
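+    # 4 input files give 4 files per stage; hash lookups are per day, so 2 files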
+    assert 4 == len(list((tmp_path / "original-csv").iterdir()))
+    assert 4 == len(list((tmp_path / "original-parquet").iterdir()))
+    assert 4 == len(list((tmp_path / "pseudonymised").iterdir()))
+    assert 2 == len(list((tmp_path / "hash-lookups").iterdir()))
+
+
+def run_snakemake(tmp_path):
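+    """Run the pipeline via docker compose (repo-root docker-compose.yml); outputs land under tmp_path."""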
+    repo_root = Path(__file__).resolve().parents[1]
+    compose_file = repo_root / "docker-compose.yml"
+
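+    # docker compose arguments for a one-off ("run --rm") container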
     compose_args = [
         "run",
         "--rm",
@@ -259,35 +294,6 @@ def test_snakemake_pipeline_runs_via_exporter_wrapper(tmp_path: Path):
     print(f"stdout:\n{result.stdout}\n" f"stderr:\n{result.stderr}")
     result.check_returncode()
 
-    # ASSERT
-    for filename, expected_data in test_data_files:
-        original_parquet_path = (
-            tmp_path / "original-parquet" / filename.get_orig_parquet()
-        )
-        pseudon_path = tmp_path / "pseudonymised" / filename.get_pseudon_parquet()
-
-        assert original_parquet_path.exists()
-        assert pseudon_path.exists()
-
-        _compare_original_parquet_to_expected(original_parquet_path, expected_data)
-        _compare_parquets(expected_data, original_parquet_path, pseudon_path)
-
-    # Check hash summaries: one per day, not per input file
-    # inspect our CSN -> hashed_csn lookup file
-    for datestr, expected_summary in expected_hash_summaries.items():
-        expected_path = tmp_path / "hash-lookups" / f"{datestr}.hashes.json"
-        actual_hash_lookup_data = json.loads(expected_path.read_text())
-        assert isinstance(actual_hash_lookup_data, list)
-        # sort order to match expected
-        actual_hash_lookup_data.sort(key=lambda x: x["csn"])
-        assert expected_summary == actual_hash_lookup_data
-
-    # check no extraneous files
-    assert 4 == len(list((tmp_path / "original-csv").iterdir()))
-    assert 4 == len(list((tmp_path / "original-parquet").iterdir()))
-    assert 4 == len(list((tmp_path / "pseudonymised").iterdir()))
-    assert 2 == len(list((tmp_path / "hash-lookups").iterdir()))
-
 
 def _compare_original_parquet_to_expected(original_parquet: Path, expected_test_values):
     # CSV should always match original parquet