Merge pull request #2 from jftuga/cli-improvements

jftuga · web-flow · commit b2d8f1e207fa · 2025-01-02T08:39:07.000-05:00
cli improvements
diff --git a/Pipfile b/Pipfile
@@ -6,12 +6,12 @@ name = "pypi"
 [dev-packages]
 black = "*"
 ruff = "*"
-veryprettytable = {git = "https://github.com/andrewspiers/VeryPrettyTable.git"}
 
 [packages]
 chardet = ">=5.2.0"
 spacy = ">=3.8.3"
 torch = ">=2.5.1"
+veryprettytable = {git = "https://github.com/andrewspiers/VeryPrettyTable.git"}
 
 [requires]
 python_version = "3.12"
diff --git a/Pipfile.lock b/Pipfile.lock
diff --git a/README.md b/README.md
@@ -36,7 +36,29 @@ pip install VeryPrettyTable
 
 ## Usage
 
-### Basic Usage
+### Command Line Interface
+
+The package includes a command-line tool for quick de-identification of text files:
+
+```bash
+python -m deidentification.deidentify input_file [options]
+```
+
+Options:
+- `-r, --replacement TEXT`: Specify replacement text for identified names (default: "PERSON")
+- `-o, --output FILE`: Output file (defaults to stdout)
+- `-H, --html`: Output in HTML format with highlighted replacements
+- `-d, --debug`: Enable debug mode
+- `-t, --tokens`: Save identified elements to a JSON file (filename--tokens.json)
+- `-v, --version`: Display version information
+
+Example:
+```bash
+# De-identify a text file and save with HTML markup
+python -m deidentification.deidentify input.txt -H -o output.html -r "[REDACTED]"
+```
+
+### Python API Usage
 
 ```python
 from deidentification import Deidentification
diff --git a/deidentification/deidentification.py b/deidentification/deidentification.py
@@ -32,7 +32,6 @@
 from .deidentification_constants import pgmName, pgmUrl, pgmVersion
 from .normalize_punctuation import normalize_punctuation
 import spacy
-import spacy
 from spacy.tokens import Doc
 import sys
 
@@ -47,6 +46,8 @@ class DeidentificationConfig:
     output_style: DeidentificationOutputStyle = DeidentificationOutputStyle.TEXT
     replacement: str = "PERSON"
     debug: bool = False
+    save_tokens: bool = False
+    filename: Optional[str] = None
 
     def __str__(self) -> str:
         return "\n".join(f"- {field.name:<15} = {getattr(self, field.name)}"
@@ -64,10 +65,17 @@ def __init__(self, config: DeidentificationConfig = DeidentificationConfig()):
         """
         self.config = config
         self.all_persons: list[dict] = []
+
+        # this combines all self.all_persons lists from multiple passes of self._find_all_persons()
+        self.aggregate_persons: list[dict] = []
+
         self.all_pronouns: list[dict] = []
         self.doc: Optional[Doc] = None
         self.table_class  = None
 
+        # used by self.get_identified_elements()
+        self.replaced_text = None
+
         if self.config.debug:
             from veryprettytable import VeryPrettyTable
             self.table_class = VeryPrettyTable
@@ -134,6 +142,8 @@ def deidentify(self, text: str) -> str:
             self.all_pronouns = []
             merged = self._merge_metadata()
             replaced_text = self._replace_merged(replaced_text, merged)
+
+        self.replaced_text = replaced_text
         return replaced_text
 
     def deidentify_with_wrapped_html(self, text: str, html_begin: str = HTML_BEGIN, html_end:str = HTML_END) -> str:
@@ -156,6 +166,10 @@ def deidentify_with_wrapped_html(self, text: str, html_begin: str = HTML_BEGIN,
         buffer.write(html_end)
         return buffer.getvalue()
 
+    def get_identified_elements(self) -> dict:
+        """Return the stored replaced text, the aggregated person entities and the current pronoun list as a dict."""
+        return {"message": self.replaced_text, "entities": self.aggregate_persons, "pronouns": self.all_pronouns}
+
     def _find_all_persons(self) -> int:
         """Find all person entities in the current document.
 
@@ -179,6 +193,7 @@ def _find_all_persons(self) -> int:
                     continue
                 record = {"text": ent.text, "start_char": ent.start_char, "end_char": ent.end_char, "label": ent.label_, "shapes": [token.shape_ for token in ent]}
                 self.all_persons.append(record)
+        self.aggregate_persons.extend(self.all_persons)
         return len(self.all_persons)
 
     def _find_all_pronouns(self) -> int:
diff --git a/deidentification/deidentification_constants.py b/deidentification/deidentification_constants.py
@@ -1,6 +1,6 @@
 pgmName = "deidentification"
 pgmUrl = "https://github.com/jftuga/deidentification"
-pgmVersion = "1.0.0"
+pgmVersion = "1.1.0"
 
 GENDER_PRONOUNS = {
     "he": "HE/SHE",
@@ -81,3 +81,4 @@ class bcolors:
     ENDC = '\033[0m'
     BOLD = '\033[1m'
     UNDERLINE = '\033[4m'
+
diff --git a/deidentification/deidentify.py b/deidentification/deidentify.py
@@ -3,11 +3,51 @@
 """Command line interface for the deidentification package."""
 
 import argparse
+import json
+import os
 import sys
+from io import StringIO
 from typing import TextIO
 
 from . import __version__
 from .deidentification import Deidentification, DeidentificationConfig, DeidentificationOutputStyle
+from .deidentification_constants import pgmUrl
+from .file_detection import read_file_with_detection
+
+def create_json_filename(input_file: str) -> str:
+    """Build the tokens JSON filename that corresponds to an input file.
+
+    The final extension of the input path (if any) is dropped and the suffix
+    "--tokens.json" is appended in its place.
+
+    Args:
+        input_file (str): Path of the original input file.
+
+    Returns:
+        str: Path of the tokens JSON file.
+            Example: "text.txt" -> "text--tokens.json"
+
+    Note:
+        Any directory portion of the path is kept unchanged.
+    """
+    stem = os.path.splitext(input_file)[0]
+    return f"{stem}--tokens.json"
+
+def save_elements(filename: str, elements: dict):
+    """Write the identified elements to a UTF-8 encoded JSON file.
+
+    An existing file at the derived output path is overwritten.
+
+    Args:
+        filename (str): Base filename; the output path is derived from it with
+            create_json_filename().
+        elements (dict): Elements to serialize. Values that are not
+            JSON-serializable are converted with str().
+    """
+    json_path = create_json_filename(filename)
+    with open(json_path, "w", encoding="utf-8") as json_file:
+        serialized = json.dumps(elements, indent=4, default=str)
+        json_file.write(serialized)
 
 
 def process_stream(input_stream: TextIO, config: DeidentificationConfig) -> str:
@@ -18,14 +58,22 @@ def process_stream(input_stream: TextIO, config: DeidentificationConfig) -> str:
         config: DeidentificationConfig instance with processing settings
 
     Returns:
-        str: De-identified text
+        str: The deidentified content as a string. If HTML output style is specified in the config,
+            the content will include HTML markup around deidentified elements.
+
+    Note:
+        If config.save_tokens is True, identified elements will be saved to a JSON file
+        using the filename specified in the config.
     """
     content = input_stream.read()
     deidentifier = Deidentification(config)
-    
-    if config.output_style == DeidentificationOutputStyle.HTML:
-        return deidentifier.deidentify_with_wrapped_html(content)
-    return deidentifier.deidentify(content)
+
+    func = deidentifier.deidentify_with_wrapped_html if config.output_style == DeidentificationOutputStyle.HTML else deidentifier.deidentify
+    content = func(content)
+    if config.save_tokens:
+        elements = deidentifier.get_identified_elements()
+        save_elements(config.filename, elements)
+    return content
 
 
 def main() -> int:
@@ -70,25 +118,49 @@ def main() -> int:
         "-v",
         "--version",
         action="version",
-        version=f"%(prog)s {__version__}",
+        version=f"%(prog)s {__version__} : {pgmUrl}",
         help="display program version and then exit"
     )
-    
+
+    parser.add_argument(
+        "-d",
+        "--debug",
+        action="store_true",
+        default=False,
+        help="enable debug mode"
+    )
+
+    parser.add_argument(
+        "-t",
+        "--tokens",
+        action="store_true",
+        default=False,
+        help="save identified elements to file ending in `--tokens.json`"
+    )
+
     args = parser.parse_args()
 
     # Configure deidentification settings
     config = DeidentificationConfig(
         replacement=args.replacement,
-        output_style=DeidentificationOutputStyle.HTML if args.html else DeidentificationOutputStyle.TEXT
+        output_style=DeidentificationOutputStyle.HTML if args.html else DeidentificationOutputStyle.TEXT,
+        debug = args.debug == True,
+        save_tokens = args.tokens == True
     )
 
     try:
         # Handle input
+        config.filename = args.input_file if args.input_file != "-" else "STDIN.txt"
+        if args.debug:
+            print(config, file=sys.stderr)
+
         if args.input_file == "-":
             result = process_stream(sys.stdin, config)
         else:
-            with open(args.input_file, "r", encoding="utf-8") as f:
-                result = process_stream(f, config)
+            file_contents, encoding = read_file_with_detection(args.input_file)
+            if config.debug:
+                print(f"DEBUG: Detected file encoding: {encoding}", file=sys.stderr)
+            result = process_stream(StringIO(file_contents), config)
 
         # Handle output
         if args.output:
diff --git a/deidentification/file_detection.py b/deidentification/file_detection.py
@@ -0,0 +1,47 @@
+
+import chardet
+import sys
+
+def read_file_with_detection(filename: str) -> tuple[str, str|None]:
+    """Read a file once in binary mode and decode it with a detected or fallback encoding.
+
+    The raw bytes are passed to chardet for encoding detection. Decoding is tried
+    with the detected encoding first and then with each fallback encoding in order;
+    the first encoding that decodes the bytes without error wins.
+
+    Args:
+        filename: Path to the file to read.
+
+    Returns:
+        A tuple (file_contents, encoding), where encoding is the name of the
+        encoding that actually decoded the bytes. If every candidate fails, an error
+        is printed to stderr and ("", None) is returned.
+
+    Raises:
+        FileNotFoundError: If the specified file does not exist.
+        OSError: If there are issues reading the file.
+
+    Note:
+        "latin1" maps every byte to a character, so the fallbacks listed after it are
+        only reached if it is moved or removed.
+    """
+    with open(filename, 'rb') as file:
+        raw_bytes = file.read()
+
+    detected_encoding = chardet.detect(raw_bytes)['encoding']
+    fallback_encodings = ("utf-8", "ascii", "cp1252", "latin1", "utf-16", "iso-8859-15", "iso-8859-1", "utf-32", "cp1251", "gb2312", "big5")
+
+    # chardet reports None when it cannot guess (e.g. an empty file); skip it so decode() never receives None
+    candidates = (detected_encoding, *fallback_encodings) if detected_encoding else fallback_encodings
+
+    for encoding in candidates:
+        try:
+            # decode with the candidate being tried, not unconditionally with the detected encoding
+            return raw_bytes.decode(encoding), encoding
+        except (UnicodeDecodeError, LookupError):
+            # LookupError: the detected encoding name is not a codec known to Python
+            continue
+
+    print(f"Error: Unable to detect file encoding for: {filename=}", file=sys.stderr)
+    return "", None
+
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,3 +1,8 @@
 [build-system]
 requires = ["setuptools>=45", "wheel"]
-build-backend = "setuptools.build_meta"
+build-backend = "setuptools.build_meta"
+
+[tool.black]
+line-length = 140
+target-version = ['py312']
+skip_magic_trailing_comma = true
diff --git a/setup.py b/setup.py