@@ -11,6 +11,7 @@
     longest,
     multi_cut,
     newmm,
+    paragraph_tokenize,
     sent_tokenize,
     subword_tokenize,
     syllable_tokenize,
@@ -22,6 +23,8 @@
 )
 from pythainlp.util import dict_trie
 
+from ..test_helpers import assert_segment_handles_none_and_empty
+
 TEXT_1 = "หมอนทองตากลมหูว์MBK39 :.ฉฺ๐๐๓-#™±"
 TEXT_2 = "ทดสอบ"
 
@@ -231,7 +234,7 @@ def test_word_detokenize(self):
         )
 
     def test_numeric_data_format(self):
-        engines = ["newmm"]
+        engines = ["newmm", "longest"]
 
         for engine in engines:
             self.assertIn(
@@ -257,6 +260,35 @@ def test_numeric_data_format(self):
             self.assertIn("2.5:1", tokens)
             self.assertIn("5:2", tokens)
 
+        # Test join_broken_num parameter (defaults to True)
+        # When True, numeric data should be preserved
+        engine = "longest"
+        self.assertIn(
+            "127.0.0.1",
+            word_tokenize(
+                "ไอพีของคุณคือ 127.0.0.1 ครับ",
+                engine=engine,
+                join_broken_num=True,
+            ),
+        )
+        # When False, numbers may be broken up
+        self.assertNotIn(
+            "127.0.0.1",
+            word_tokenize(
+                "ไอพีของคุณคือ 127.0.0.1 ครับ",
+                engine=engine,
+                join_broken_num=False,
+            ),
+        )
+        self.assertNotIn(
+            "1,234,567.89",
+            word_tokenize(
+                "รางวัลมูลค่า 1,234,567.89 บาท",
+                engine=engine,
+                join_broken_num=False,
+            ),
+        )
+
 
 class TokenizeTestCase(unittest.TestCase):
     def test_Tokenizer(self):
@@ -361,8 +393,7 @@ def test_word_tokenize(self):
             )
 
     def test_etcc(self):
-        self.assertEqual(etcc.segment(None), [])
-        self.assertEqual(etcc.segment(""), [])
+        assert_segment_handles_none_and_empty(self, etcc.segment)
         self.assertIsInstance(etcc.segment("คืนความสุข"), list)
         self.assertEqual(
             etcc.segment("หาเงินเพื่อเรียน"),
@@ -377,8 +408,7 @@ def test_etcc(self):
         )
 
     def test_longest(self):
-        self.assertEqual(longest.segment(None), [])
-        self.assertEqual(longest.segment(""), [])
+        assert_segment_handles_none_and_empty(self, longest.segment)
         self.assertIsInstance(
             longest.segment("กรุงเทพฯมากๆเพราโพาง BKKฯ"), list
         )
@@ -430,8 +460,7 @@ def test_longest_custom_dict(self):
         )
 
     def test_mm(self):
-        self.assertEqual(multi_cut.segment(None), [])
-        self.assertEqual(multi_cut.segment(""), [])
+        assert_segment_handles_none_and_empty(self, multi_cut.segment)
         self.assertIsNotNone(multi_cut.segment("ตัด", dict_trie([""])))
 
         self.assertEqual(word_tokenize("", engine="mm"), [])
@@ -468,8 +497,7 @@ def test_mm(self):
         self.assertEqual(multi_cut.find_all_segment(None), [])
 
     def test_newmm(self):
-        self.assertEqual(newmm.segment(None), [])
-        self.assertEqual(newmm.segment(""), [])
+        assert_segment_handles_none_and_empty(self, newmm.segment)
         self.assertEqual(
             word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="newmm"),
             ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"],
@@ -556,8 +584,7 @@ def test_newmm_dangertext(self):
         )
 
     def test_tcc(self):
-        self.assertEqual(tcc.segment(None), [])
-        self.assertEqual(tcc.segment(""), [])
+        assert_segment_handles_none_and_empty(self, tcc.segment)
         self.assertEqual(
             tcc.segment("ประเทศไทย"), ["ป", "ระ", "เท", "ศ", "ไท", "ย"]
         )
@@ -616,8 +643,7 @@ def test_tcc(self):
         self.assertEqual(tcc.tcc_pos(""), set())
 
     def test_tcc_p(self):
-        self.assertEqual(tcc_p.segment(None), [])
-        self.assertEqual(tcc_p.segment(""), [])
+        assert_segment_handles_none_and_empty(self, tcc_p.segment)
         self.assertEqual(
             tcc_p.segment("ประเทศไทย"), ["ป", "ระ", "เท", "ศ", "ไท", "ย"]
         )
@@ -652,3 +678,12 @@ def test_display_cell_tokenize(self):
         self.assertEqual(display_cell_tokenize("สวัสดี"), ["ส", "วั", "ส", "ดี"])
         self.assertEqual(display_cell_tokenize("ทดสอบ"), ["ท", "ด", "ส", "อ", "บ"])
         self.assertEqual(display_cell_tokenize("ภาษาไทย"), ["ภ", "า", "ษ", "า", "ไ", "ท", "ย"])
+
+    def test_paragraph_tokenize(self):
+        # Test error handling for invalid engine
+        text = (
+            "(1) บทความนี้ผู้เขียนสังเคราะห์ขึ้นมา"
+            "จากผลงานวิจัยที่เคยทำมาในอดีต"
+        )
+        with self.assertRaises(ValueError):
+            paragraph_tokenize(text, engine="non-existent-engine")
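
Note: the shared helper imported at the top is not part of this diff. Based on the repeated assertion pairs it replaces, a minimal sketch of what `tests/test_helpers.py` might contain is shown below; the name and signature come from the import statement, while the body is an assumption inferred from the removed lines, not the actual implementation.

```python
# tests/test_helpers.py -- hypothetical sketch; the real file is not shown in this diff.
import unittest
from typing import Callable, List


def assert_segment_handles_none_and_empty(
    test: unittest.TestCase,
    segment: Callable[..., List[str]],
) -> None:
    """Assert that a segmenter returns an empty list for None and "" input.

    Consolidates the pair of assertions previously repeated in each
    tokenizer test: segment(None) == [] and segment("") == [].
    """
    test.assertEqual(segment(None), [])
    test.assertEqual(segment(""), [])
```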