11"""
22
3- Icegrams: A trigrams library for Icelandic
3+ Icegrams: A trigrams library for Icelandic
44
5- trie.py
5+ trie.py
66
7- Copyright (C) 2024 Miðeind ehf.
8- Original author: Vilhjálmur Þorsteinsson
7+ Copyright (C) 2020-2025 Miðeind ehf.
8+ Original author: Vilhjálmur Þorsteinsson
99
10- This software is licensed under the MIT License:
10+ This software is licensed under the MIT License:
1111
12- Permission is hereby granted, free of charge, to any person
13- obtaining a copy of this software and associated documentation
14- files (the "Software"), to deal in the Software without restriction,
15- including without limitation the rights to use, copy, modify, merge,
16- publish, distribute, sublicense, and/or sell copies of the Software,
17- and to permit persons to whom the Software is furnished to do so,
18- subject to the following conditions:
12+ Permission is hereby granted, free of charge, to any person
13+ obtaining a copy of this software and associated documentation
14+ files (the "Software"), to deal in the Software without restriction,
15+ including without limitation the rights to use, copy, modify, merge,
16+ publish, distribute, sublicense, and/or sell copies of the Software,
17+ and to permit persons to whom the Software is furnished to do so,
18+ subject to the following conditions:
1919
20- The above copyright notice and this permission notice shall be
21- included in all copies or substantial portions of the Software.
20+ The above copyright notice and this permission notice shall be
21+ included in all copies or substantial portions of the Software.
2222
23- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
26- IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
27- CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
28- TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
29- SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
26+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
27+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
28+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
29+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
3030
3131
32- This module encapsulated the unigram trie logic used
33- by ngrams.py to compress the unigram set and map
34- unigrams to integer ids.
32+ This module encapsulates the unigram trie logic used
33+ by ngrams.py to compress the unigram set and map
34+ unigrams to integer ids.
3535
36- Trie lookup is implemented in trie.cpp.
36+ Trie lookup is implemented in trie.cpp.
3737
3838"""
3939
4848
4949
5050class _Node :
51-
52- """ A Node within a Trie """
51+ """A Node within a Trie"""
5352
5453 __slots__ = ("fragment" , "value" , "children" )
5554
@@ -61,7 +60,7 @@ def __init__(self, fragment: bytes, value: Optional[int]) -> None:
6160 self .children : Optional [List [_Node ]] = None
6261
6362 def add (self , fragment : bytes , value : int ) -> Optional [int ]:
64- """ Add the given remaining key fragment to this node """
63+ """Add the given remaining key fragment to this node"""
6564 if len (fragment ) == 0 :
6665 if self .value is not None :
6766 # This key already exists: return its value
@@ -151,8 +150,8 @@ def add(self, fragment: bytes, value: int) -> Optional[int]:
151150 return None
152151
153152 def lookup (self , fragment : bytes ) -> Optional [int ]:
154- """ Lookup the given key fragment in this node and its children
155- as necessary """
153+ """Lookup the given key fragment in this node and its children
154+ as necessary"""
156155 if not fragment :
157156 # We've arrived at our destination: return the value
158157 return self .value
@@ -165,7 +164,7 @@ def lookup(self, fragment: bytes) -> Optional[int]:
165164 for child in self .children :
166165 if fragment .startswith (child .fragment ):
167166 # This is a continuation route: take it
168- return child .lookup (fragment [len (child .fragment ):])
167+ return child .lookup (fragment [len (child .fragment ) :])
169168 # No route matches: the key was not found
170169 return None
171170
@@ -176,29 +175,28 @@ def __str__(self) -> str:
176175
177176
178177class Trie :
179-
180- """ Wrapper class for a radix (compact) trie data structure.
181- Each node in the trie contains a prefix string, leading
182- to its children. """
178+ """Wrapper class for a radix (compact) trie data structure.
179+ Each node in the trie contains a prefix string, leading
180+ to its children."""
183181
184182 def __init__ (
185- self , root_fragment : bytes = b"" , reserve_zero_for_empty : bool = True
183+ self , root_fragment : bytes = b"" , reserve_zero_for_empty : bool = True
186184 ) -> None :
187185 # We reserve the 0 index for the empty string
188186 self ._cnt = 1 if reserve_zero_for_empty else 0
189187 self ._root = _Node (root_fragment , None )
190188
191189 @property
192190 def root (self ) -> _Node :
193- """ Return the root node of the trie """
191+ """Return the root node of the trie"""
194192 return self ._root
195193
196- def add (self , key : bytes , value : Optional [int ]= None ) -> int :
197- """ Add the given (key, value) pair to the trie.
198- Duplicates are not allowed and not added to the trie.
199- If the value is None, it is set to the number of entries
200- already in the trie, thereby making it function as
201- an automatic generator of list indices. """
194+ def add (self , key : bytes , value : Optional [int ] = None ) -> int :
195+ """Add the given (key, value) pair to the trie.
196+ Duplicates are not allowed and not added to the trie.
197+ If the value is None, it is set to the number of entries
198+ already in the trie, thereby making it function as
199+ an automatic generator of list indices."""
202200 if not key :
203201 return 0
204202 if value is None :
@@ -212,16 +210,16 @@ def add(self, key: bytes, value: Optional[int]=None) -> int:
212210 self ._cnt += 1
213211 return value
214212
215- def get (self , key : bytes , default : Optional [int ]= None ) -> Optional [int ]:
216- """ Lookup the given key and return the associated value,
217- or the default if the key is not found. """
213+ def get (self , key : bytes , default : Optional [int ] = None ) -> Optional [int ]:
214+ """Lookup the given key and return the associated value,
215+ or the default if the key is not found."""
218216 if not key :
219217 return 0
220218 value = self ._root .lookup (key )
221219 return default if value is None else value
222220
223221 def __getitem__ (self , key : bytes ) -> int :
224- """ Lookup in square bracket notation """
222+ """Lookup in square bracket notation"""
225223 if not key :
226224 return 0
227225 value = self ._root .lookup (key )
@@ -230,12 +228,12 @@ def __getitem__(self, key: bytes) -> int:
230228 return value
231229
232230 def __len__ (self ) -> int :
233- """ Return the number of unique keys within the trie,
234- including the empty string sentinel that has the value 0 """
231+ """Return the number of unique keys within the trie,
232+ including the empty string sentinel that has the value 0"""
235233 return self ._cnt
236234
237- def write (self , f : BinaryIO , * , verbose : bool = False ) -> None :
238- """ Write the unigram trie contents to a packed binary stream """
235+ def write (self , f : BinaryIO , * , verbose : bool = False ) -> None :
236+ """Write the unigram trie contents to a packed binary stream"""
239237 # We assume that the alphabet can be represented in 7 bits
240238 todo : deque [Tuple [_Node , int ]] = deque ()
241239 node_cnt = 0
@@ -245,9 +243,9 @@ def write(self, f: BinaryIO, *, verbose: bool=False) -> None:
245243 max_distance = 0
246244
247245 def write_node (node : _Node , parent_loc : int ) -> None :
248- """ Write a single node to the packed binary stream,
249- and fix up the parent's pointer to the location
250- of this node """
246+ """Write a single node to the packed binary stream,
247+ and fix up the parent's pointer to the location
248+ of this node"""
251249 loc = f .tell ()
252250 val = 0x007FFFFF if node .value is None else node .value
253251 assert val < 2 ** 23
@@ -269,10 +267,7 @@ def write_node(node: _Node, parent_loc: int) -> None:
269267 assert chix < 2 ** 7
270268 f .write (
271269 UINT32 .pack (
272- 0x80000000
273- | childless_bit
274- | (chix << 23 )
275- | (val & 0x007FFFFF )
270+ 0x80000000 | childless_bit | (chix << 23 ) | (val & 0x007FFFFF )
276271 )
277272 )
278273 single_char_node_count += 1
@@ -320,9 +315,9 @@ def write_node(node: _Node, parent_loc: int) -> None:
320315 if verbose :
321316 print (
322317 "Written {0:,} nodes, thereof {1:,} single-char nodes "
323- "and {2:,} multi-char."
324- .format (node_cnt , single_char_node_count , multi_char_node_count )
318+ "and {2:,} multi-char." .format (
319+ node_cnt , single_char_node_count , multi_char_node_count
320+ )
325321 )
326322 print ("Childless nodes are {0:,}." .format (no_child_node_count ))
327323 print ("Maximum fixup distance is {0:,} bytes." .format (max_distance ))
328-
0 commit comments