@@ -36,22 +36,6 @@ class WhiteSpace(NormalizerBaseModel):
3636 also multiple white spaces does not add any value to a text and
3737 should thus be removed to normalize the text.
3838
39- :param strip, lstrip, rstrip: Settings to strip white spaces from
40- beginning or end of the string for normalization. By default,
41- all the spaces are removed as they do not provide any
42- additional information and is mostly an error in typing text.
43-
44- :param newline: Strip new line characters from a multiple line
45- (i.e., a paragraph or text from "text area") to get one single
46- text, defaults to True.
47-
48- :param newlinesep: A string value which defaults to the systems'
49- default new line seperator ("\r \n " `CRLF` for windows, and
50- "\n " `LF` for *nix based systems) to replace from string.
51-
52- :param multispace: Replace multiple spaces which often reduces the
53- models' performance, defaults to True.
54-
5539 A modular approach is now enabled which is derived from a base
5640 normalization class. The usage is as below:
5741
@@ -76,36 +60,90 @@ class WhiteSpace(NormalizerBaseModel):
7660
7761 strip : bool = Field (
7862 True ,
79- description = "Strip of trailing white spaces from text."
63+ description = """
64+ Strip white spaces from both the beginning and the end of the
65+ string for normalization. By default, all the spaces are
66+ removed as they do not provide any additional information for
67+ a LLM/NLP based models and reduces token counts.
68+
69+ When the attribute is set to ``True`` the alternate parameters
70+ :attr:`lstrip` and :attr:`rstrip` is ignored, check model
71+ validator for more information. This uses the Python in-built
72+ string function as in example below:
73+
74+ .. code-block:: python
75+
76+ text = " this is a long text "
77+ print(text.strip())
78+ >> 'this is a long text'
79+
80+ Further customization like specifying alternate set of
81+ characters to be removed from the string is also supported by
82+ using the :attr:`strip_chars` attribute, for more information
83+ check `docs <https://docs.python.org/3/library/stdtypes.html#str.strip>`_.
84+ """
8085 )
8186
8287 lstrip : bool = Field (
8388 True ,
84- description = "Strip white spaces from beginning of text."
89+ description = """
90+ When set to true (default) removes the leading white characters
91+ from the string, or specify alternate set using
92+ :attr:`strip_chars` attribute.
93+ """
8594 )
8695
8796 rstrip : bool = Field (
8897 True ,
89- description = "Strip white spaces from end of text."
98+ description = """
99+ When set to true (default) removes the trailing white
100+ characters from the string, or specify alternate set using
101+ :attr:`strip_chars` attribute.
102+ """
103+ )
104+
105+ strip_chars : str = Field (
106+ None ,
107+ description = """
108+ Custom set characters to be removed from the string. The
109+ argument is not a "prefix" or a "suffix" but a combination of
110+ all the values to be stripped. Check
111+ `docs <https://docs.python.org/3/library/stdtypes.html#str.strip>`_
112+ for more information.
113+ """
90114 )
91115
92116 newline : bool = Field (
93117 True ,
94- description = "Strip any new line characters from text."
118+ description = """
119+ Strip new line characters from a multiple line (i.e., a
120+ paragraph or text from "text area") to get one single text,
121+ defaults to True. By default, :attr:`strip` removes new lines
122+ from the beginning and end, while this argument using string
123+ replace method to remove within lines - useful when the source
124+ text is paragraphed and needs to be cleaned.
125+ """
95126
96127 )
97128
98129 # ? if new line is true, then also allow to provide new line
99130 # which defaults to the operating system default
100131 newlinesep : str = Field (
101132 default = os .linesep ,
102- description = "Default line seperator based on system."
133+ description = """
134+ A string value which defaults to the systems' default new line
135+ seperator ("\\ r\\ n" `CRLF` for windows, and "\\ n" `LF` for
136+ *nix based systems) to replace from string.
137+ """
103138 )
104139
105140 # ? remove multiple whitespace - uses regular expressions
106141 multispace : bool = Field (
107142 default = True ,
108- description = "Remove multiple spaces from text using regexp."
143+ description = """
144+ Replace multiple spaces using regular expressions, which often
145+ reduces the models' performance, defaults to True.
146+ """
109147 )
110148
111149
@@ -114,14 +152,14 @@ def apply(self, text : str) -> str:
114152
115153 # first - strip the white space from beginning and end of text
116154 if self .strip :
117- text = text .strip ()
155+ text = text .strip (chars = self . strip_chars )
118156 else :
119157 if self .lstrip :
120- text = text .lstrip ()
158+ text = text .lstrip (chars = self . strip_chars )
121159 elif self .rstrip :
122- text = text .strip ()
160+ text = text .strip (chars = self . strip_chars )
123161 else :
124- pass # todo raise invalid warning for combination
162+ pass # error is raised during model assertion
125163
126164 # second, remove new line characters from the text
127165 if self .newline :
@@ -140,6 +178,11 @@ def model_validator(self) -> object:
140178 Pydantic generic model validator which validates all the
141179 fields using the self.attribute parameter and is generic to
142180 the class.
181+
182+ :raises UserWarning: A warning is raised when the parameter
183+ does not follow specified directive. It is recommended to
184+ check the attribute settings before using :func:`.apply()`
185+ or it might generate unwanted output.
143186 """
144187
145188 s , ls , rs = self .strip , self .lstrip , self .rstrip
@@ -167,21 +210,27 @@ class CaseFolding(NormalizerBaseModel):
167210 A Model to Normalize Case Folding from Texts
168211
169212 Case folding from raw data source is often in title case, or is in
170- a mixed case which hinder the NLP/LLM model's performance. The
213+ a mixed case which may hinder the NLP/LLM model's performance. The
171214 general convention is to convert all to lower cases using native
172215 Python function :func:`lower()` which is available for strings.
173-
174- The class provides a pydantic model which does the same thing and
175- when used in a pipeline provides robust and dynamic type checking
176- and adheres to the normalization process.
177-
178- :param upper, lower: Either set the text to upper case, or to
179- lower case as per user choice. Default configuration sets the
180- value to lower case.
181216 """
182217
183- upper : bool = False
184- lower : bool = True
218+ upper : bool = Field (
219+ False ,
220+ description = """
221+ Convert the text to upper case and return the text without
222+ altering other things. Defaults to False, the class converts
223+ the text to lower case which is recommended in LLM/NLP models.
224+ """
225+ )
226+
227+ lower : bool = Field (
228+ True ,
229+ description = """
230+ Convert the contents fof the text to lower case (default) for
231+ an easy forward integration with LLM/NLP based models.
232+ """
233+ )
185234
186235 def apply (self , text : str ) -> str :
187236 """
@@ -216,27 +265,51 @@ class StopWords(NormalizerBaseModel):
216265 that when removed from a text improves an NLP/LLM models'
217266 performance. By default, the model is set to use the stopwords in
218267 the English language.
268+ """
219269
220- :param language: A valid language name which is available and
221- defined under :func:`nltk.corpus.stopwords`, defaults to the
222- English language.
270+ language : str = Field (
271+ "english" ,
272+ description = """
273+ A valid language name which is available and defined under
274+ :func:`nltk.corpus.stopwords`, defaults to the English. To see
275+ a valid list of languages follow below.
223276
224- :param extrawords: The model gives the flexibility to add extra
225- words which will be treated as stopwords which are not already
226- defined under the :func:`nltk.corpus.stopwords` function. This
227- can be helpful in dynamic debuging and quick manipulation of
228- text to check forward models performance.
277+ .. code-block:: python
229278
230- :param excludewords: Opposite to ``extrawords`` this attribute
231- helps in updating the stopwords by removing/excluding words
232- from already defined set.
233- """
279+ import nltk
280+
281+ # download the corpus if not already available
282+ # nltk.download("stopwords")
283+ from nltk.corpus import stopwords
234284
235- language : str = "english"
236- extrawords : list = []
285+ # once downloaded and available, check available list:
286+ print(stopwords.fileids())
287+
288+ The code block is dependent on :mod:`nltk` for more information
289+ check `docs <https://www.nltk.org/index.html>`_.
290+ """
291+ )
292+
293+ extrawords : list = Field (
294+ [],
295+ description = """
296+ The model gives the flexibility to add extra words which will
297+ be treated as stopwords which are not already defined under
298+ the :func:`nltk.corpus.stopwords` function. This can be
299+ helpful in dynamic debuging and quick manipulation of text to
300+ check forward models performance.
301+ """
302+ )
237303
238304 # ..versionadded:: 2025-10-24 - also allow words to be excluded
239- excludewords : list = []
305+ excludewords : list = Field (
306+ [],
307+ description = """
308+ Opposite to ``extrawords`` this attribute helps in updating
309+ the stopwords by removing/excluding words from the already
310+ defined words in ``stopwords.words(self.language)`` list.
311+ """
312+ )
240313
241314 # ! by default, nltk library provides stopwords in lower case
242315 # however, we can override and set the value as per our case needs
0 commit comments