
Commit 55168fe

Merge pull request #1253 from PyThaiNLP/copilot/increase-test-coverage

Move tltk tests from noauto to extra tier

2 parents: 78d3ee4 + cd12517

16 files changed: +284 -195 lines
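
The tier names in the commit title map to test directories: tests/core runs everywhere, while tests/compact and tests/extra exercise optional dependency sets (the extra set now including tltk), and tests under noauto are, as the name suggests, not run automatically. A sketch of running just the extra tier locally, assuming standard unittest discovery and the testx_*.py naming visible below (the project may provide its own runner):

import unittest

# Hypothetical invocation: discover only the extra-tier tests by their
# testx_ filename prefix and run them verbosely.
suite = unittest.defaultTestLoader.discover("tests", pattern="testx_*.py")
unittest.TextTestRunner(verbosity=2).run(suite)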

pyproject.toml

Lines changed: 2 additions & 0 deletions

@@ -50,6 +50,7 @@ classifiers = [
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
     "Programming Language :: Python :: 3.13",
+    "Programming Language :: Python :: 3.14",
     "Intended Audience :: Developers",
     "Natural Language :: Thai",
     "Topic :: Scientific/Engineering :: Artificial Intelligence",
@@ -186,6 +187,7 @@ extra = [
     "pandas>=0.24",
     "ssg>=0.0.8",
     "symspellpy>=6.7.6",
+    "tltk>=1.10",
 ]

 # Full dependencies - pinned where available
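
With tltk moved from the noauto tier into the extra dependency set, it remains optional at runtime. A minimal sketch of the guard typically used for such optional imports (an illustration, not code from this commit):

# tltk is optional: present only when installed via `pip install pythainlp[extra]`.
try:
    import tltk
except ImportError:
    tltk = None  # callers should check for None before using tltk-backed engines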

tests/compact/testc_tokenize.py

Lines changed: 2 additions & 2 deletions

@@ -23,6 +23,7 @@
     SENT_4,
     TEXT_1,
 )
+from ..test_helpers import assert_segment_handles_none_and_empty


 class SentTokenizeCRFCutTestCaseC(unittest.TestCase):
@@ -79,8 +80,7 @@ def test_subword_tokenize(self):

 class WordTokenizeICUTestCaseC(unittest.TestCase):
     def test_icu(self):
-        self.assertEqual(pyicu.segment(None), [])
-        self.assertEqual(pyicu.segment(""), [])
+        assert_segment_handles_none_and_empty(self, pyicu.segment)
         self.assertEqual(
             word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="icu"),
             ["ฉัน", "รัก", "ภาษา", "ไทย", "เพราะ", "ฉัน", "เป็น", "คน", "ไทย"],

tests/compact/testc_util.py

Lines changed: 1 addition & 2 deletions

@@ -2,8 +2,7 @@
 # SPDX-FileType: SOURCE
 # SPDX-License-Identifier: Apache-2.0

-"""Unit tests for pythainlp.util module.
-"""
+"""Unit tests for pythainlp.util module."""

 import unittest
tests/core/test_robustness.py

Lines changed: 0 additions & 1 deletion

@@ -253,4 +253,3 @@ def test_word_tokenize_with_very_long_strings(self):
                 f"word_tokenize (engine={engine}) failed with "
                 f"very long string (index={i}): {e}"
             )
-

tests/core/test_tag.py

Lines changed: 6 additions & 0 deletions

@@ -6,6 +6,7 @@
 from os import path

 from pythainlp.tag import (
+    NER,
     PerceptronTagger,
     perceptron,
     pos_tag,
@@ -88,6 +89,11 @@ def test_pos_tag(self):
             ],
         )

+    def test_NER_error_handling(self):
+        # Test error handling for invalid engine/corpus combination
+        with self.assertRaises(ValueError):
+            NER(engine="thainer", corpus="cat")
+

 class PerceptronTaggerTestCase(unittest.TestCase):
     """Test pythainlp.tag.PerceptronTagger

tests/core/test_tokenize.py

Lines changed: 48 additions & 13 deletions

@@ -11,6 +11,7 @@
     longest,
     multi_cut,
     newmm,
+    paragraph_tokenize,
     sent_tokenize,
     subword_tokenize,
     syllable_tokenize,
@@ -22,6 +23,8 @@
 )
 from pythainlp.util import dict_trie

+from ..test_helpers import assert_segment_handles_none_and_empty
+
 TEXT_1 = "หมอนทองตากลมหูว์MBK39 :.ฉฺ๐๐๓-#™±"
 TEXT_2 = "ทดสอบ"

@@ -231,7 +234,7 @@ def test_word_detokenize(self):
         )

     def test_numeric_data_format(self):
-        engines = ["newmm"]
+        engines = ["newmm", "longest"]

         for engine in engines:
             self.assertIn(
@@ -257,6 +260,35 @@ def test_numeric_data_format(self):
         self.assertIn("2.5:1", tokens)
         self.assertIn("5:2", tokens)

+        # Test join_broken_num parameter (defaults to True)
+        # When True, numeric data should be preserved
+        engine = "longest"
+        self.assertIn(
+            "127.0.0.1",
+            word_tokenize(
+                "ไอพีของคุณคือ 127.0.0.1 ครับ",
+                engine=engine,
+                join_broken_num=True,
+            ),
+        )
+        # When False, numbers may be broken up
+        self.assertNotIn(
+            "127.0.0.1",
+            word_tokenize(
+                "ไอพีของคุณคือ 127.0.0.1 ครับ",
+                engine=engine,
+                join_broken_num=False,
+            ),
+        )
+        self.assertNotIn(
+            "1,234,567.89",
+            word_tokenize(
+                "รางวัลมูลค่า 1,234,567.89 บาท",
+                engine=engine,
+                join_broken_num=False,
+            ),
+        )
+

 class TokenizeTestCase(unittest.TestCase):
     def test_Tokenizer(self):
@@ -361,8 +393,7 @@ def test_word_tokenize(self):
         )

     def test_etcc(self):
-        self.assertEqual(etcc.segment(None), [])
-        self.assertEqual(etcc.segment(""), [])
+        assert_segment_handles_none_and_empty(self, etcc.segment)
         self.assertIsInstance(etcc.segment("คืนความสุข"), list)
         self.assertEqual(
             etcc.segment("หาเงินเพื่อเรียน"),
@@ -377,8 +408,7 @@ def test_etcc(self):
         )

     def test_longest(self):
-        self.assertEqual(longest.segment(None), [])
-        self.assertEqual(longest.segment(""), [])
+        assert_segment_handles_none_and_empty(self, longest.segment)
         self.assertIsInstance(
             longest.segment("กรุงเทพฯมากๆเพราโพาง BKKฯ"), list
         )
@@ -430,8 +460,7 @@ def test_longest_custom_dict(self):
         )

     def test_mm(self):
-        self.assertEqual(multi_cut.segment(None), [])
-        self.assertEqual(multi_cut.segment(""), [])
+        assert_segment_handles_none_and_empty(self, multi_cut.segment)
         self.assertIsNotNone(multi_cut.segment("ตัด", dict_trie([""])))

         self.assertEqual(word_tokenize("", engine="mm"), [])
@@ -468,8 +497,7 @@ def test_mm(self):
         self.assertEqual(multi_cut.find_all_segment(None), [])

     def test_newmm(self):
-        self.assertEqual(newmm.segment(None), [])
-        self.assertEqual(newmm.segment(""), [])
+        assert_segment_handles_none_and_empty(self, newmm.segment)
         self.assertEqual(
             word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="newmm"),
             ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"],
@@ -556,8 +584,7 @@ def test_newmm_dangertext(self):
         )

     def test_tcc(self):
-        self.assertEqual(tcc.segment(None), [])
-        self.assertEqual(tcc.segment(""), [])
+        assert_segment_handles_none_and_empty(self, tcc.segment)
         self.assertEqual(
             tcc.segment("ประเทศไทย"), ["ป", "ระ", "เท", "ศ", "ไท", "ย"]
         )
@@ -616,8 +643,7 @@ def test_tcc(self):
         self.assertEqual(tcc.tcc_pos(""), set())

     def test_tcc_p(self):
-        self.assertEqual(tcc_p.segment(None), [])
-        self.assertEqual(tcc_p.segment(""), [])
+        assert_segment_handles_none_and_empty(self, tcc_p.segment)
         self.assertEqual(
             tcc_p.segment("ประเทศไทย"), ["ป", "ระ", "เท", "ศ", "ไท", "ย"]
         )
@@ -652,3 +678,12 @@ def test_display_cell_tokenize(self):
         self.assertEqual(display_cell_tokenize("สวัสดี"), ['ส', 'วั', 'ส', 'ดี'])
         self.assertEqual(display_cell_tokenize("ทดสอบ"), ["ท", "ด", "ส", "อ", "บ"])
         self.assertEqual(display_cell_tokenize("ภาษาไทย"), ["ภ", "า", "ษ", "า", "ไ", "ท", "ย"])
+
+    def test_paragraph_tokenize(self):
+        # Test error handling for invalid engine
+        text = (
+            "(1) บทความนี้ผู้เขียนสังเคราะห์ขึ้นมา"
+            "จากผลงานวิจัยที่เคยทำมาในอดีต"
+        )
+        with self.assertRaises(ValueError):
+            paragraph_tokenize(text, engine="non-existent-engine")
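
Outside the test harness, the join_broken_num behavior exercised above amounts to the following (illustrative; exact token boundaries depend on the engine and dictionary):

from pythainlp.tokenize import word_tokenize

text = "ไอพีของคุณคือ 127.0.0.1 ครับ"
# Default (join_broken_num=True): "127.0.0.1" survives as one token.
word_tokenize(text, engine="longest")
# join_broken_num=False: the digits and "." may come back as separate tokens.
word_tokenize(text, engine="longest", join_broken_num=False)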

tests/extra/testx_augment.py

Lines changed: 3 additions & 3 deletions

@@ -22,13 +22,13 @@ def setUp(self):
         self.text2 = "เราอยู่ที่มหาวิทยาลัยขอนแก่น"

     def test_WordNetAug(self):
-        nltk.download('omw-1.4', force=True)  # load wordnet
+        nltk.download("omw-1.4", force=True)  # load wordnet
         wordnetaug = WordNetAug()
         self.assertIsNotNone(wordnetaug.augment(self.text))
         self.assertIsNotNone(wordnetaug.find_synonyms("ผม", pos=None))
         self.assertIsNotNone(wordnetaug.augment(self.text, postag=False))
-        self.assertIsNone(postype2wordnet('n', 'abc'))
-        self.assertIsNotNone(postype2wordnet('NOUN', 'orchid'))
+        self.assertIsNone(postype2wordnet("n", "abc"))
+        self.assertIsNotNone(postype2wordnet("NOUN", "orchid"))

     # def test_Thai2fitAug(self):
     #     _aug = Thai2fitAug()

tests/extra/testx_spell.py

Lines changed: 13 additions & 5 deletions

@@ -3,7 +3,6 @@
 # SPDX-License-Identifier: Apache-2.0

 # Tests for spell functions that need extra dependencies
-# Note: Tests requiring phunspell/tltk/torch/HuggingFace Hub have been moved to tests.noauto

 import unittest

@@ -20,7 +19,6 @@

 class SpellTestCaseX(unittest.TestCase):
     def test_spell(self):
-        # Tests for symspellpy only (phunspell and tltk moved to noauto)
         result = spell("เน้ร", engine="symspellpy")
         self.assertIsInstance(result, list)
         self.assertGreater(len(result), 0)
@@ -30,16 +28,26 @@ def test_spell(self):
         self.assertGreater(len(result), 0)

     def test_word_correct(self):
-        # Tests for symspellpy only (phunspell and wanchanberta moved to noauto)
         result = correct("ทดสอง", engine="symspellpy")
         self.assertIsInstance(result, str)
         self.assertNotEqual(result, "")

     def test_spell_sent(self):
-        # Tests for symspellpy only (phunspell moved to noauto)
         self.assertIsNotNone(spell_sent(SENT_TOKS, engine="symspellpy"))

     def test_correct_sent(self):
-        # Tests for symspellpy only (phunspell and wanchanberta moved to noauto)
         self.assertIsNotNone(correct_sent(SENT_TOKS, engine="symspellpy"))
         self.assertIsNotNone(symspellpy.correct_sent(SENT_TOKS))
+
+
+class SpellTLTKTestCaseX(unittest.TestCase):
+    """Tests for tltk engine spell checking"""
+
+    def test_spell_tltk(self):
+        result = spell("เน้ร", engine="tltk")
+        self.assertIsInstance(result, list)
+        self.assertGreater(len(result), 0)
+
+        result = spell("เดก", engine="tltk")
+        self.assertIsInstance(result, list)
+        self.assertGreater(len(result), 0)
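
Run outside the suite, the new tltk test corresponds roughly to this usage (a sketch; it requires tltk from the extra dependency set):

from pythainlp.spell import spell

suggestions = spell("เดก", engine="tltk")
print(suggestions)  # expected: a non-empty list of candidate corrections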

tests/extra/testx_tag.py

Lines changed: 37 additions & 3 deletions

@@ -3,16 +3,14 @@
 # SPDX-License-Identifier: Apache-2.0

 # Tests for tag functions that need extra dependencies
-# Note: Tests requiring transformers/tltk have been moved to tests.noautotest

 import unittest

+from pythainlp.tag import pos_tag, tltk
 from pythainlp.tag.thainer import ThaiNameTagger


 class TagTestCaseX(unittest.TestCase):
-    # Tests for ThaiNameTagger (doesn't require transformers or tltk)
-    # All tltk and transformers-based tests have been moved to tests.noautotest

     def test_thai_name_tagger_1_5(self):
         ner = ThaiNameTagger(version="1.5")
@@ -117,3 +115,39 @@ def test_thai_name_tagger_1_4(self):
             )
         )

+
+class TagTLTKTestCaseX(unittest.TestCase):
+    """Tests for tltk engine POS tagging and NER"""
+
+    def test_pos_tag_tltk(self):
+        tokens = ["ผม", "รัก", "คุณ"]
+        self.assertIsNotNone(pos_tag(tokens, engine="tltk"))
+        with self.assertRaises(ValueError):
+            tltk.pos_tag(tokens, corpus="blackboard")
+
+    def test_tltk_ner(self):
+        self.assertEqual(tltk.get_ner(""), [])
+        self.assertIsNotNone(tltk.get_ner("แมวทำอะไรตอนห้าโมงเช้า"))
+        self.assertIsNotNone(tltk.get_ner("แมวทำอะไรตอนห้าโมงเช้า", pos=False))
+        self.assertIsNotNone(
+            tltk.get_ner("พลเอกประยุกธ์ จันทร์โอชา ประกาศในฐานะหัวหน้า")
+        )
+        self.assertIsNotNone(
+            tltk.get_ner(
+                "พลเอกประยุกธ์ จันทร์โอชา ประกาศในฐานะหัวหน้า",
+                tag=True,
+            )
+        )
+        self.assertIsNotNone(
+            tltk.get_ner(
+                """คณะวิทยาศาสตร์ประยุกต์และวิศวกรรมศาสตร์ มหาวิทยาลัยขอนแก่น
+จังหวัดหนองคาย 43000"""
+            )
+        )
+        self.assertIsNotNone(
+            tltk.get_ner(
+                """คณะวิทยาศาสตร์ประยุกต์และวิศวกรรมศาสตร์ มหาวิทยาลัยขอนแก่น
+จังหวัดหนองคาย 43000""",
+                tag=True,
+            )
+        )