Skip to content

Commit 5c3899f

Browse files
authored
Merge pull request #1255 from bact/add-more-tests
Add more NER
2 parents: 55168fe + 85dbcef; commit: 5c3899f

File tree

4 files changed: 35 additions and 23 deletions.

pythainlp/tag/named_entity.py

Lines changed: 26 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -16,12 +16,15 @@ class NER:
1616
:param str corpus: corpus
1717
1818
**Options for engine**
19-
* *thainer-v2* - Thai NER engine v2.0 for Thai NER 2.0 (default)
19+
* *phayathaibert* - PhayaThaiBERT-based Thai NER engine
2020
* *thainer* - Thai NER engine
21+
* *thainer-v2* - Thai NER engine v2.0 for Thai NER 2.0 (default)
2122
* *tltk* - wrapper for `TLTK <https://pypi.org/project/tltk/>`_.
23+
* *wangchanberta* - WangchanBERTa-based Thai NER engine
2224
2325
**Options for corpus**
2426
* *thainer* - Thai NER corpus (default)
27+
* *thainer-v2* - Thai NER v2 corpus
2528
2629
**Note**: The tltk engine supports NER models from tltk only.
2730
"""
@@ -34,29 +37,33 @@ def __init__(
3437
def load_engine(self, engine: str, corpus: str) -> None:
3538
self.name_engine = engine
3639
self.engine: Any = None
37-
if engine == "thainer" and corpus == "thainer":
38-
from pythainlp.tag.thainer import ThaiNameTagger
40+
if corpus == "thainer":
41+
if engine == "thainer":
42+
from pythainlp.tag.thainer import ThaiNameTagger
3943

40-
self.engine = ThaiNameTagger()
41-
elif engine == "thainer-v2" and corpus == "thainer":
42-
from pythainlp.wangchanberta import NamedEntityRecognition
44+
self.engine = ThaiNameTagger()
45+
elif engine == "thainer-v2":
46+
from pythainlp.wangchanberta import NamedEntityRecognition
4347

44-
self.engine = NamedEntityRecognition(
45-
model="pythainlp/thainer-corpus-v2-base-model"
46-
)
47-
elif engine == "tltk":
48-
from pythainlp.tag import tltk
48+
self.engine = NamedEntityRecognition(
49+
model="pythainlp/thainer-corpus-v2-base-model"
50+
)
51+
elif engine == "wangchanberta":
52+
from pythainlp.wangchanberta import ThaiNameTagger as WangchanbertaThaiNameTagger # type: ignore[assignment] # noqa: I001,E501
53+
54+
self.engine = WangchanbertaThaiNameTagger(dataset_name=corpus) # type: ignore[call-arg]
55+
elif corpus == "thainer-v2":
56+
if engine == "phayathaibert":
57+
from pythainlp.phayathaibert.core import NamedEntityTagger
4958

50-
self.engine = tltk
51-
elif engine == "wangchanberta" and corpus == "thainer":
52-
from pythainlp.wangchanberta import ThaiNameTagger # type: ignore[assignment] # noqa: I001
59+
self.engine = NamedEntityTagger()
60+
else: # No corpus matched
61+
if engine == "tltk":
62+
from pythainlp.tag import tltk
5363

54-
self.engine = ThaiNameTagger(dataset_name=corpus) # type: ignore[call-arg]
55-
elif engine == "phayathaibert" and corpus == "thainer-v2":
56-
from pythainlp.phayathaibert.core import NamedEntityTagger
64+
self.engine = tltk
5765

58-
self.engine = NamedEntityTagger()
59-
else:
66+
if self.engine is None:
6067
raise ValueError(
6168
f"NER class not support {engine} engine or {corpus} corpus."
6269
)

pythainlp/util/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -45,6 +45,8 @@
4545
"reorder_vowels",
4646
"rhyme",
4747
"sound_syllable",
48+
"spell_syllable",
49+
"spell_word",
4850
"spelling",
4951
"spell_words",
5052
"syllable_length",
@@ -121,6 +123,7 @@
121123
from pythainlp.util.remove_trailing_repeat_consonants import (
122124
remove_trailing_repeat_consonants,
123125
)
126+
from pythainlp.util.spell_words import spell_syllable, spell_word
124127
from pythainlp.util.strftime import thai_strftime
125128

126129
from pythainlp.util.thai import (

tests/compact/testc_util.py

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -6,8 +6,7 @@
66

77
import unittest
88

9-
from pythainlp.util import rhyme, thai_word_tone_detector
10-
from pythainlp.util.spell_words import spell_word
9+
from pythainlp.util import rhyme, spell_word, thai_word_tone_detector
1110

1211

1312
class SpellWordTestCaseC(unittest.TestCase):

tests/core/test_tag.py

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -90,9 +90,12 @@ def test_pos_tag(self):
9090
)
9191

9292
def test_NER_error_handling(self):
93-
# Test error handling for invalid engine/corpus combination
9493
with self.assertRaises(ValueError):
95-
NER(engine="thainer", corpus="cat")
94+
NER(engine="xx_non_existing", corpus="thainer")
95+
with self.assertRaises(ValueError):
96+
NER(engine="xx_non_existing", corpus="thainer-v2")
97+
with self.assertRaises(ValueError):
98+
NER(engine="xx_non_existing", corpus="xx_non_existing")
9699

97100

98101
class PerceptronTaggerTestCase(unittest.TestCase):

0 commit comments

Comments
 (0)