33"""Command line interface for the deidentification package."""
44
55import argparse
6+ import json
7+ import os
68import sys
9+ from io import StringIO
710from typing import TextIO
811
912from . import __version__
1013from .deidentification import Deidentification , DeidentificationConfig , DeidentificationOutputStyle
14+ from .deidentification_constants import pgmUrl
15+ from .file_detection import read_file_with_detection
16+
def create_json_filename(input_file: str) -> str:
    """Return the metadata JSON filename that corresponds to ``input_file``.

    The last extension of ``input_file`` (as determined by
    ``os.path.splitext``) is replaced with ``--tokens.json``. Any directory
    component of the path is left untouched.

    Args:
        input_file (str): Original input file path.

    Returns:
        str: Path for the metadata JSON file.
            Example: "text.txt" -> "text--tokens.json"

    Note:
        Only the final extension is stripped, so "a.tar.gz" becomes
        "a.tar--tokens.json". A name without an extension (including a
        dot-file such as ".bashrc") simply has the suffix appended.
    """
    # splitext returns (root, ext); the extension itself is not needed.
    filename, _ = os.path.splitext(input_file)
    return filename + "--tokens.json"
35+
def save_elements(filename: str, elements: dict) -> None:
    """Save a dictionary of elements to a JSON file with UTF-8 encoding.

    The output path is not ``filename`` itself: it is derived from it by
    create_json_filename(). The file is opened in write mode, so an existing
    file at that path is overwritten.

    Args:
        filename (str): The base filename to save to. Will be converted to a
            JSON filename using create_json_filename().
        elements (dict): Dictionary containing the elements to save. Values
            that aren't JSON-serializable are converted with ``str()``.

    Returns:
        None
    """
    outfile = create_json_filename(filename)
    with open(outfile, "w", encoding="utf-8") as fp:
        # default=str stringifies any value json cannot encode natively.
        fp.write(json.dumps(elements, indent=4, default=str))
1151
1252
1353def process_stream (input_stream : TextIO , config : DeidentificationConfig ) -> str :
@@ -18,14 +58,22 @@ def process_stream(input_stream: TextIO, config: DeidentificationConfig) -> str:
1858 config: DeidentificationConfig instance with processing settings
1959
2060 Returns:
21- str: De-identified text
61+ str: The deidentified content as a string. If HTML output style is specified in the config,
62+ the content will include HTML markup around deidentified elements.
63+
64+ Note:
65+ If config.save_tokens is True, identified elements will be saved to a JSON file
66+ using the filename specified in the config.
2267 """
2368 content = input_stream .read ()
2469 deidentifier = Deidentification (config )
25-
26- if config .output_style == DeidentificationOutputStyle .HTML :
27- return deidentifier .deidentify_with_wrapped_html (content )
28- return deidentifier .deidentify (content )
70+
71+ func = deidentifier .deidentify_with_wrapped_html if config .output_style == DeidentificationOutputStyle .HTML else deidentifier .deidentify
72+ content = func (content )
73+ if config .save_tokens :
74+ elements = deidentifier .get_identified_elements ()
75+ save_elements (config .filename , elements )
76+ return content
2977
3078
3179def main () -> int :
@@ -70,25 +118,49 @@ def main() -> int:
70118 "-v" ,
71119 "--version" ,
72120 action = "version" ,
73- version = f"%(prog)s { __version__ } " ,
121+ version = f"%(prog)s { __version__ } : { pgmUrl } " ,
74122 help = "display program version and then exit"
75123 )
76-
124+
125+ parser .add_argument (
126+ "-d" ,
127+ "--debug" ,
128+ action = "store_true" ,
129+ default = False ,
130+ help = "enable debug mode"
131+ )
132+
133+ parser .add_argument (
134+ "-t" ,
135+ "--tokens" ,
136+ action = "store_true" ,
137+ default = False ,
138+ help = "save identified elements to file ending in `--tokens.json`"
139+ )
140+
77141 args = parser .parse_args ()
78142
79143 # Configure deidentification settings
80144 config = DeidentificationConfig (
81145 replacement = args .replacement ,
82- output_style = DeidentificationOutputStyle .HTML if args .html else DeidentificationOutputStyle .TEXT
146+ output_style = DeidentificationOutputStyle .HTML if args .html else DeidentificationOutputStyle .TEXT ,
147+ debug = args .debug == True ,
148+ save_tokens = args .tokens == True
83149 )
84150
85151 try :
86152 # Handle input
153+ config .filename = args .input_file if args .input_file != "-" else "STDIN.txt"
154+ if args .debug :
155+ print (config , file = sys .stderr )
156+
87157 if args .input_file == "-" :
88158 result = process_stream (sys .stdin , config )
89159 else :
90- with open (args .input_file , "r" , encoding = "utf-8" ) as f :
91- result = process_stream (f , config )
160+ file_contents , encoding = read_file_with_detection (args .input_file )
161+ if config .debug :
162+ print (f"DEBUG: Detected file encoding: { encoding } " , file = sys .stderr )
163+ result = process_stream (StringIO (file_contents ), config )
92164
93165 # Handle output
94166 if args .output :
0 commit comments