Skip to content

Commit b2d8f1e

Browse files
authored
Merge pull request #2 from jftuga/cli-improvements
cli improvements
2 parents 1c10533 + c05bf4a commit b2d8f1e

File tree

9 files changed

+201
-38
lines changed

9 files changed

+201
-38
lines changed

Pipfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,12 @@ name = "pypi"
66
[dev-packages]
77
black = "*"
88
ruff = "*"
9-
veryprettytable = {git = "https://github.com/andrewspiers/VeryPrettyTable.git"}
109

1110
[packages]
1211
chardet = ">=5.2.0"
1312
spacy = ">=3.8.3"
1413
torch = ">=2.5.1"
14+
veryprettytable = {git = "https://github.com/andrewspiers/VeryPrettyTable.git"}
1515

1616
[requires]
1717
python_version = "3.12"

Pipfile.lock

Lines changed: 21 additions & 21 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

README.md

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,29 @@ pip install VeryPrettyTable
3636

3737
## Usage
3838

39-
### Basic Usage
39+
### Command Line Interface
40+
41+
The package includes a command-line tool for quick de-identification of text files:
42+
43+
```bash
44+
python -m deidentification.deidentify input_file [options]
45+
```
46+
47+
Options:
48+
- `-r, --replacement TEXT`: Specify replacement text for identified names (default: "PERSON")
49+
- `-o, --output FILE`: Output file (defaults to stdout)
50+
- `-H, --html`: Output in HTML format with highlighted replacements
51+
- `-d, --debug`: Enable debug mode
52+
- `-t, --tokens`: Save identified elements to a JSON file (filename--tokens.json)
53+
- `-v, --version`: Display version information
54+
55+
Example:
56+
```bash
57+
# De-identify a text file and save with HTML markup
58+
python -m deidentification.deidentify input.txt -H -o output.html -r "[REDACTED]"
59+
```
60+
61+
### Python API Usage
4062

4163
```python
4264
from deidentification import Deidentification

deidentification/deidentification.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@
3232
from .deidentification_constants import pgmName, pgmUrl, pgmVersion
3333
from .normalize_punctuation import normalize_punctuation
3434
import spacy
35-
import spacy
3635
from spacy.tokens import Doc
3736
import sys
3837

@@ -47,6 +46,8 @@ class DeidentificationConfig:
4746
output_style: DeidentificationOutputStyle = DeidentificationOutputStyle.TEXT
4847
replacement: str = "PERSON"
4948
debug: bool = False
49+
save_tokens: bool = False
50+
filename: Optional[str] = None
5051

5152
def __str__(self) -> str:
5253
return "\n".join(f"- {field.name:<15} = {getattr(self, field.name)}"
@@ -64,10 +65,17 @@ def __init__(self, config: DeidentificationConfig = DeidentificationConfig()):
6465
"""
6566
self.config = config
6667
self.all_persons: list[dict] = []
68+
69+
# this combines all self.all_persons lists from multiple passes of self._find_all_persons()
70+
self.aggregate_persons: list[dict] = []
71+
6772
self.all_pronouns: list[dict] = []
6873
self.doc: Optional[Doc] = None
6974
self.table_class = None
7075

76+
# used by self.get_identified_elements()
77+
self.replaced_text = None
78+
7179
if self.config.debug:
7280
from veryprettytable import VeryPrettyTable
7381
self.table_class = VeryPrettyTable
@@ -134,6 +142,8 @@ def deidentify(self, text: str) -> str:
134142
self.all_pronouns = []
135143
merged = self._merge_metadata()
136144
replaced_text = self._replace_merged(replaced_text, merged)
145+
146+
self.replaced_text = replaced_text
137147
return replaced_text
138148

139149
def deidentify_with_wrapped_html(self, text: str, html_begin: str = HTML_BEGIN, html_end:str = HTML_END) -> str:
@@ -156,6 +166,10 @@ def deidentify_with_wrapped_html(self, text: str, html_begin: str = HTML_BEGIN,
156166
buffer.write(html_end)
157167
return buffer.getvalue()
158168

169+
def get_identified_elements(self) -> dict:
    """Bundle the results of the most recent de-identification run.

    Returns:
        dict: A mapping with three keys:
            "message"  -> self.replaced_text
            "entities" -> self.aggregate_persons
            "pronouns" -> self.all_pronouns
        The values are the instance's own objects, not copies.
    """
    return {
        "message": self.replaced_text,
        "entities": self.aggregate_persons,
        "pronouns": self.all_pronouns,
    }
172+
159173
def _find_all_persons(self) -> int:
160174
"""Find all person entities in the current document.
161175
@@ -179,6 +193,7 @@ def _find_all_persons(self) -> int:
179193
continue
180194
record = {"text": ent.text, "start_char": ent.start_char, "end_char": ent.end_char, "label": ent.label_, "shapes": [token.shape_ for token in ent]}
181195
self.all_persons.append(record)
196+
self.aggregate_persons.extend(self.all_persons)
182197
return len(self.all_persons)
183198

184199
def _find_all_pronouns(self) -> int:

deidentification/deidentification_constants.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
pgmName = "deidentification"
22
pgmUrl = "https://github.com/jftuga/deidentification"
3-
pgmVersion = "1.0.0"
3+
pgmVersion = "1.1.0"
44

55
GENDER_PRONOUNS = {
66
"he": "HE/SHE",
@@ -81,3 +81,4 @@ class bcolors:
8181
ENDC = '\033[0m'
8282
BOLD = '\033[1m'
8383
UNDERLINE = '\033[4m'
84+

deidentification/deidentify.py

Lines changed: 82 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,51 @@
33
"""Command line interface for the deidentification package."""
44

55
import argparse
6+
import json
7+
import os
68
import sys
9+
from io import StringIO
710
from typing import TextIO
811

912
from . import __version__
1013
from .deidentification import Deidentification, DeidentificationConfig, DeidentificationOutputStyle
14+
from .deidentification_constants import pgmUrl
15+
from .file_detection import read_file_with_detection
16+
17+
def create_json_filename(input_file: str) -> str:
    """Derive the tokens/metadata JSON path that belongs to an input file.

    The final extension of ``input_file`` (as split by ``os.path.splitext``)
    is dropped and the suffix "--tokens.json" is appended. Any directory
    component of the path is left untouched.

    Args:
        input_file (str): Path of the original input file.

    Returns:
        str: Path of the companion JSON file.
        Example: "text.txt" -> "text--tokens.json"
    """
    stem = os.path.splitext(input_file)[0]
    return f"{stem}--tokens.json"
35+
36+
def save_elements(filename: str, elements: dict):
    """Write ``elements`` to a UTF-8 encoded, 4-space indented JSON file.

    Args:
        filename (str): The input file's name; the actual output path is
            obtained by passing it through create_json_filename().
        elements (dict): Data to serialize. Values that are not natively
            JSON-serializable are converted with ``str``.

    Returns:
        None
    """
    target_path = create_json_filename(filename)
    with open(target_path, "w", encoding="utf-8") as handle:
        # default=str stringifies anything the json module cannot encode itself.
        serialized = json.dumps(elements, indent=4, default=str)
        handle.write(serialized)
1151

1252

1353
def process_stream(input_stream: TextIO, config: DeidentificationConfig) -> str:
@@ -18,14 +58,22 @@ def process_stream(input_stream: TextIO, config: DeidentificationConfig) -> str:
1858
config: DeidentificationConfig instance with processing settings
1959
2060
Returns:
21-
str: De-identified text
61+
str: The deidentified content as a string. If HTML output style is specified in the config,
62+
the content will include HTML markup around deidentified elements.
63+
64+
Note:
65+
If config.save_tokens is True, identified elements will be saved to a JSON file
66+
using the filename specified in the config.
2267
"""
2368
content = input_stream.read()
2469
deidentifier = Deidentification(config)
25-
26-
if config.output_style == DeidentificationOutputStyle.HTML:
27-
return deidentifier.deidentify_with_wrapped_html(content)
28-
return deidentifier.deidentify(content)
70+
71+
func = deidentifier.deidentify_with_wrapped_html if config.output_style == DeidentificationOutputStyle.HTML else deidentifier.deidentify
72+
content = func(content)
73+
if config.save_tokens:
74+
elements = deidentifier.get_identified_elements()
75+
save_elements(config.filename, elements)
76+
return content
2977

3078

3179
def main() -> int:
@@ -70,25 +118,49 @@ def main() -> int:
70118
"-v",
71119
"--version",
72120
action="version",
73-
version=f"%(prog)s {__version__}",
121+
version=f"%(prog)s {__version__} : {pgmUrl}",
74122
help="display program version and then exit"
75123
)
76-
124+
125+
parser.add_argument(
126+
"-d",
127+
"--debug",
128+
action="store_true",
129+
default=False,
130+
help="enable debug mode"
131+
)
132+
133+
parser.add_argument(
134+
"-t",
135+
"--tokens",
136+
action="store_true",
137+
default=False,
138+
help="save identified elements to file ending in `--tokens.json`"
139+
)
140+
77141
args = parser.parse_args()
78142

79143
# Configure deidentification settings
80144
config = DeidentificationConfig(
81145
replacement=args.replacement,
82-
output_style=DeidentificationOutputStyle.HTML if args.html else DeidentificationOutputStyle.TEXT
146+
output_style=DeidentificationOutputStyle.HTML if args.html else DeidentificationOutputStyle.TEXT,
147+
debug = args.debug == True,
148+
save_tokens = args.tokens == True
83149
)
84150

85151
try:
86152
# Handle input
153+
config.filename = args.input_file if args.input_file != "-" else "STDIN.txt"
154+
if args.debug:
155+
print(config, file=sys.stderr)
156+
87157
if args.input_file == "-":
88158
result = process_stream(sys.stdin, config)
89159
else:
90-
with open(args.input_file, "r", encoding="utf-8") as f:
91-
result = process_stream(f, config)
160+
file_contents, encoding = read_file_with_detection(args.input_file)
161+
if config.debug:
162+
print(f"DEBUG: Detected file encoding: {encoding}", file=sys.stderr)
163+
result = process_stream(StringIO(file_contents), config)
92164

93165
# Handle output
94166
if args.output:

deidentification/file_detection.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
2+
import chardet
3+
import sys
4+
5+
def read_file_with_detection(filename: str) -> tuple[str, str|None]:
    """Read a file once in binary mode and decode it to text.

    The raw bytes are passed to chardet to guess the encoding. Decoding is
    attempted with that guess first and then with a fixed list of fallback
    encodings; the first encoding that decodes the bytes without error wins.

    Args:
        filename: Path to the file to read.

    Returns:
        A tuple ``(file_contents, encoding)`` where ``encoding`` is the name
        of the encoding that actually decoded the bytes. If no candidate
        succeeds, an error is printed to stderr and ``("", None)`` is returned.

    Raises:
        FileNotFoundError: If the specified file does not exist.
        OSError: If there are issues opening or reading the file.
    """
    with open(filename, "rb") as file:
        raw_bytes = file.read()

    detected_encoding = chardet.detect(raw_bytes)["encoding"]
    fallback_encodings = ("utf-8", "ascii", "cp1252", "latin1", "utf-16", "iso-8859-15", "iso-8859-1", "utf-32", "cp1251", "gb2312", "big5")

    # chardet reports None when it cannot make a guess; bytes.decode(None)
    # would raise TypeError, so only try the detected value when there is one.
    candidates = [detected_encoding] if detected_encoding else []
    candidates.extend(fallback_encodings)

    # NOTE: "latin1" maps every possible byte value, so in practice the
    # candidates listed after it are never reached.
    for encoding in candidates:
        try:
            # Decode with the candidate currently being tried (not always the
            # detected one) and report the encoding that really worked.
            return raw_bytes.decode(encoding), encoding
        except (UnicodeDecodeError, LookupError):
            # LookupError: the detected name is not a codec Python knows.
            continue

    print(f"Error: Unable to detect file encoding for: {filename=}", file=sys.stderr)
    return "", None
47+

pyproject.toml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
11
[build-system]
22
requires = ["setuptools>=45", "wheel"]
3-
build-backend = "setuptools.build_meta"
3+
build-backend = "setuptools.build_meta"
4+
5+
[tool.black]
6+
line-length = 140
7+
target-version = ['py312']
8+
skip_magic_trailing_comma = true

0 commit comments

Comments
 (0)