
Commit f22c110

Merge pull request #1280 from PyThaiNLP/copilot/refactor-any-type-annotations
Remove redundant type annotation reassignments
2 parents 8a188f2 + cc24a14 commit f22c110

File tree

22 files changed, +453 -178 lines changed


build_tools/analysis/output/type_hint_analysis.json

Lines changed: 285 additions & 38 deletions
Large diffs are not rendered by default.

pythainlp/augment/wordnet.py

Lines changed: 3 additions & 2 deletions
@@ -12,9 +12,10 @@

 import itertools
 from collections import OrderedDict
-from typing import Any, Callable, Optional
+from typing import Callable, Optional

 from nltk.corpus import wordnet as wn
+from nltk.corpus.reader.wordnet import Synset

 from pythainlp.corpus import wordnet
 from pythainlp.tag import pos_tag
@@ -121,7 +122,7 @@ class WordNetAug:
         synonyms: list[str]
         list_synsets: list
         p2w_pos: Optional[str]
-        synset: Any
+        synset: Synset
         syn: str
         synonyms_without_duplicates: list[str]
         list_words: list[str]
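
Note: the value previously typed as Any is a WordNet synset object from NLTK, so the narrowed Synset annotation matches what the lookup actually returns. A minimal sketch, assuming the NLTK WordNet corpus has been downloaded (e.g. via nltk.download("wordnet")):

    from nltk.corpus import wordnet as wn
    from nltk.corpus.reader.wordnet import Synset

    # wn.synsets() returns Synset objects, which is why the narrowed
    # annotation is accurate for the variable in the diff above.
    synsets: list[Synset] = wn.synsets("dog")
    for synset in synsets:
        print(synset.name(), [lemma.name() for lemma in synset.lemmas()])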

pythainlp/benchmarks/word_tokenization.py

Lines changed: 12 additions & 12 deletions
@@ -5,7 +5,7 @@

 import re
 import sys
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Union

 if TYPE_CHECKING:
     import numpy as np
@@ -42,7 +42,7 @@ def _f1(precision: float, recall: float) -> float:
     return 2 * precision * recall / (precision + recall)


-def _flatten_result(my_dict: dict, sep: str = ":") -> dict[str, Any]:
+def _flatten_result(my_dict: dict, sep: str = ":") -> dict[str, Union[int, str]]:
     """Flatten two-dimension dictionary.

     Use keys in the first dimension as a prefix for keys in the second dimension.
@@ -56,7 +56,7 @@ def _flatten_result(my_dict: dict, sep: str = ":") -> dict[str, Any]:
     :param str sep: separator between the two keys (default: ":")

     :return: a one-dimension dictionary with keys combined
-    :rtype: dict[str, Any]
+    :rtype: dict[str, Union[int, str]]
     """
     return {
         f"{k1}{sep}{k2}": v
@@ -133,7 +133,7 @@ def preprocessing(txt: str, remove_space: bool = True) -> str:
     return txt


-def compute_stats(ref_sample: str, raw_sample: str) -> dict[str, Any]:
+def compute_stats(ref_sample: str, raw_sample: str) -> dict[str, dict[str, Union[int, str]]]:
     """Compute statistics for tokenization quality

     These statistics include:
@@ -150,7 +150,7 @@ def compute_stats(ref_sample: str, raw_sample: str) -> dict[str, Any]:
     :param str samples: samples that we want to evaluate

     :return: metrics at character- and word-level and indicators of correctly tokenized words
-    :rtype: dict[str, Any]
+    :rtype: dict[str, dict[str, Union[int, str]]]
     """
     import numpy as np

@@ -166,11 +166,11 @@ def compute_stats(ref_sample: str, raw_sample: str) -> dict[str, Any]:
     c_pos_pred = c_pos_pred[c_pos_pred < ref_sample_arr.shape[0]]
     c_neg_pred = c_neg_pred[c_neg_pred < ref_sample_arr.shape[0]]

-    c_tp: np.intp = np.sum(ref_sample_arr[c_pos_pred] == 1)
-    c_fp: np.intp = np.sum(ref_sample_arr[c_pos_pred] == 0)
+    c_tp: int = int(np.sum(ref_sample_arr[c_pos_pred] == 1))
+    c_fp: int = int(np.sum(ref_sample_arr[c_pos_pred] == 0))

-    c_tn: np.intp = np.sum(ref_sample_arr[c_neg_pred] == 0)
-    c_fn: np.intp = np.sum(ref_sample_arr[c_neg_pred] == 1)
+    c_tn: int = int(np.sum(ref_sample_arr[c_neg_pred] == 0))
+    c_fn: int = int(np.sum(ref_sample_arr[c_neg_pred] == 1))

     # Compute word-level statistics

@@ -183,7 +183,7 @@ def compute_stats(ref_sample: str, raw_sample: str) -> dict[str, Any]:
         word_boundaries, ss_boundaries
     )

-    correctly_tokenised_words: np.intp = np.sum(tokenization_indicators)
+    correctly_tokenised_words: int = int(np.sum(tokenization_indicators))

     tokenization_indicators_str = list(map(str, tokenization_indicators))

@@ -196,8 +196,8 @@ def compute_stats(ref_sample: str, raw_sample: str) -> dict[str, Any]:
         },
         "word_level": {
             "correctly_tokenised_words": correctly_tokenised_words,
-            "total_words_in_sample": np.sum(sample_arr),
-            "total_words_in_ref_sample": np.sum(ref_sample_arr),
+            "total_words_in_sample": int(np.sum(sample_arr)),
+            "total_words_in_ref_sample": int(np.sum(ref_sample_arr)),
         },
         "global": {
             "tokenisation_indicators": "".join(tokenization_indicators_str)

pythainlp/chat/core.py

Lines changed: 6 additions & 4 deletions
@@ -3,19 +3,21 @@
 # SPDX-License-Identifier: Apache-2.0
 from __future__ import annotations

-from typing import TYPE_CHECKING, Any, Optional, cast
+from typing import TYPE_CHECKING, Optional, cast

 if TYPE_CHECKING:
     import torch

+    from pythainlp.generate.wangchanglm import WangChanGLM
+

 class ChatBotModel:
     history: list[tuple[str, str]]
-    model: Any
+    model: "WangChanGLM"

     def __init__(self) -> None:
         """Chat using AI generation"""
-        self.history: list[tuple[str, str]] = []
+        self.history = []

     def reset_chat(self) -> None:
         """Reset chat by cleaning history"""
@@ -49,7 +51,7 @@ def load_model(
         if model_name == "wangchanglm":
             from pythainlp.generate.wangchanglm import WangChanGLM

-            self.model: Any = WangChanGLM()
+            self.model = WangChanGLM()
             self.model.load_model(
                 model_path="pythainlp/wangchanglm-7.5B-sft-en-sharded",
                 return_dict=return_dict,
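
Note: the concrete annotation works without importing WangChanGLM at module load time because the import sits under TYPE_CHECKING and the annotation is a string, while the runtime import stays lazy inside load_model(). A generic sketch of the pattern; the module and class names below are placeholders, not PyThaiNLP APIs:

    from __future__ import annotations

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # Seen only by type checkers; never executed at runtime,
        # so the heavy dependency is not imported unless needed.
        from heavy_package.model import HeavyModel


    class Wrapper:
        model: "HeavyModel"

        def load_model(self) -> None:
            # Runtime import happens lazily, inside the method.
            from heavy_package.model import HeavyModel

            self.model = HeavyModel()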

pythainlp/classify/param_free.py

Lines changed: 4 additions & 6 deletions
@@ -35,8 +35,8 @@ def __init__(
         if model_path:
             self.load(model_path)
         else:
-            self.training_data: "NDArray[Any]" = np.array(training_data)
-            self.cx2_list: list[int] = self.train()
+            self.training_data = np.array(training_data)
+            self.cx2_list = self.train()

     def train(self) -> list[int]:
         temp_list = []
@@ -112,7 +112,5 @@ def load(self, path: str) -> None:

         with open(path, "r", encoding="utf-8") as f:
             data = json.load(f)
-        self.cx2_list: list[int] = data["cx2_list"]
-        self.training_data: "NDArray[Any]" = np.array(
-            data["training_data"]
-        )
+        self.cx2_list = data["cx2_list"]
+        self.training_data = np.array(data["training_data"])
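
Note: re-annotating self.training_data and self.cx2_list at every assignment is redundant once the attribute types are established by a single declaration or the first assignment. A sketch of the declare-once pattern, with a hypothetical class standing in for the classifier above:

    import numpy as np
    import numpy.typing as npt


    class Classifier:  # hypothetical stand-in, not the PyThaiNLP class
        # Attribute types declared once, at class level.
        training_data: npt.NDArray
        cx2_list: list[int]

        def __init__(self, training_data: list[list[str]]) -> None:
            # Later assignments need no annotation; the declarations above apply.
            self.training_data = np.array(training_data)
            self.cx2_list = []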

pythainlp/cli/tag.py

Lines changed: 2 additions & 2 deletions
@@ -6,7 +6,7 @@
 from __future__ import annotations

 import argparse
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING

 from pythainlp import cli
 from pythainlp.tag import pos_tag
@@ -38,7 +38,7 @@ def __init__(self, name: str, argv: Sequence[str]) -> None:
         )

         args = parser.parse_args(argv)
-        self.args: Any = args
+        self.args: argparse.Namespace = args

         tokens = args.text.split(args.separator)
         result = self.run(tokens)

pythainlp/cli/tokenize.py

Lines changed: 1 addition & 1 deletion
@@ -74,7 +74,7 @@ def __init__(self, name: str, argv: Sequence[str]) -> None:
         parser.set_defaults(keep_whitespace=True)

         args = parser.parse_args(argv)
-        self.args: Any = args
+        self.args: argparse.Namespace = args

         cli.exit_if_empty(args.text, parser)
         result = self.run(
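
Note: in both CLI modules the annotation now states what argparse already guarantees: parse_args() returns an argparse.Namespace. A minimal illustration:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--text", default="")

    # parse_args() returns argparse.Namespace, so the narrowed annotation
    # is exact and no Any is needed.
    args: argparse.Namespace = parser.parse_args([])
    print(args.text)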

pythainlp/coref/_fastcoref.py

Lines changed: 12 additions & 4 deletions
@@ -3,13 +3,21 @@
 # SPDX-License-Identifier: Apache-2.0
 from __future__ import annotations

-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Optional, TypedDict

 if TYPE_CHECKING:
-    from fastcoref.modeling import CorefModel
+    from fastcoref.modeling import CorefModel, CorefResult
     from spacy.language import Language


+class CorefResultDict(TypedDict):
+    """Dictionary representation of coreference resolution results."""
+
+    text: str
+    clusters_string: list[list[str]]
+    clusters: list[list[tuple[int, int]]]
+
+
 class FastCoref:
     def __init__(
         self,
@@ -34,14 +42,14 @@ def __init__(
             self.model_name, device=device, nlp=self.nlp
         )

-    def _to_json(self, _predict: Any) -> dict[str, Any]:
+    def _to_json(self, _predict: "CorefResult") -> CorefResultDict:
         return {
             "text": _predict.text,
             "clusters_string": _predict.get_clusters(as_strings=True),
             "clusters": _predict.get_clusters(as_strings=False),
         }

-    def predict(self, texts: list[str]) -> list[dict]:
+    def predict(self, texts: list[str]) -> list[CorefResultDict]:
         return [
             self._to_json(pred) for pred in self.model.predict(texts=texts)
         ]
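
Note: the new TypedDict gives the JSON-style result a checkable shape instead of dict[str, Any]. A small sketch of what that buys; the values below are made up for illustration:

    from typing import TypedDict


    class CorefResultDict(TypedDict):
        text: str
        clusters_string: list[list[str]]
        clusters: list[list[tuple[int, int]]]


    result: CorefResultDict = {
        "text": "Ann said she would come.",
        "clusters_string": [["Ann", "she"]],
        "clusters": [[(0, 3), (9, 12)]],
    }
    # A type checker now rejects missing keys or wrongly typed values,
    # which dict[str, Any] could not do.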

pythainlp/corpus/core.py

Lines changed: 3 additions & 3 deletions
@@ -38,9 +38,9 @@ class _ResponseWrapper:
     _content: bytes

     def __init__(self, response: HTTPResponse) -> None:
-        self.status_code: int = response.status
-        self.headers: HTTPMessage = response.headers
-        self._content: bytes = response.read()
+        self.status_code = response.status
+        self.headers = response.headers
+        self._content = response.read()

     def json(self) -> dict[str, Any]:
         """Parse JSON content from response."""

pythainlp/generate/thai2fit.py

Lines changed: 5 additions & 2 deletions
@@ -16,7 +16,10 @@
 from typing import TYPE_CHECKING, Any, Union

 if TYPE_CHECKING:
+    from pathlib import Path
+
     import pandas as pd
+    from fastai.basic_train import Learner
     from fastai.text import (
         AWD_LSTM,
         LMDataBunch,
@@ -48,7 +51,7 @@
 )

 # get dummy data
-imdb: Any = untar_data(URLs.IMDB_SAMPLE)
+imdb: "Path" = untar_data(URLs.IMDB_SAMPLE)
 dummy_df: "pd.DataFrame" = pd.read_csv(imdb / "texts.csv")

 # get vocab
@@ -113,7 +116,7 @@
     "beta": 1,
 }

-learn: Any = language_model_learner(
+learn: "Learner" = language_model_learner(
     data_lm, AWD_LSTM, config=config, pretrained=False, **trn_args
 )
