@@ -104,6 +104,52 @@ def _run(self, report: pl.DataFrame) -> pl.DataFrame:
104104 )
105105
106106
class DeduplicateResponsesProcessor(ReportProcessor):
    """Deduplicate responses, keeping latest by "activity_end_time".

    This processor removes duplicate item responses for the same
    user/activity/submission combination, keeping only the most recent entry.
    """

    NAME = "DeduplicateResponses"
    PRIORITY = 9  # after datetime handling
    ENABLE = True

    def _run(self, report: pl.DataFrame) -> pl.DataFrame:
        """Deduplicate report by keeping latest activity_end_time.

        Args:
            report: Item-response rows. Expected to carry some of the key
                columns and an ``activity_end_time`` column used for
                ordering (presumably a datetime — TODO confirm upstream).

        Returns:
            The report with at most one row per
            user/activity/submission/item combination.
        """
        # Define the columns that should be unique
        unique_cols = ["user_id", "activity_id", "activity_submission_id", "item_id"]

        # Check which columns actually exist in the report
        existing_unique_cols = [col for col in unique_cols if col in report.columns]
        if not existing_unique_cols:
            # No key columns present: group_by([]) would raise, and there
            # is nothing sensible to deduplicate on — pass through as-is.
            return report

        # Count rows per key to detect duplicates.
        # NOTE(review): pl.count() is deprecated in recent polars in favor
        # of pl.len() — switch once the pinned polars version is confirmed.
        duplicate_check = report.group_by(existing_unique_cols).agg(
            pl.count().alias("count")
        )
        duplicates = duplicate_check.filter(pl.col("count") > 1)

        if duplicates.height == 0:
            # Fast path: nothing to remove. Skips the sort/unique pass and
            # avoids logging a misleading "Removed 0 duplicate rows".
            return report

        LOG.warning(
            "Found %d duplicate item responses. "
            "Keeping the latest entry by activity_end_time.",
            duplicates.height,
        )
        LOG.debug("Duplicate details:\n %s", duplicates)

        # Sort newest-first so unique(keep="first") retains the row with
        # the latest activity_end_time for each key.
        report = report.sort("activity_end_time", descending=True).unique(
            subset=existing_unique_cols, keep="first"
        )

        # Total surplus rows = all rows in duplicated groups minus one
        # kept row per group.
        LOG.info(
            "Removed %d duplicate rows",
            duplicates["count"].sum() - duplicates.height,
        )

        return report
152+
107153class ResponseStructProcessor (ReportProcessor ):
108154 """Convert response to struct using Lark.
109155
0 commit comments