Skip to content

Commit 9a71d04

Browse files
committed
Merge branch 'docs/sphinx-doc-formats'
2 parents 161bd25 + fb0002e commit 9a71d04

File tree

2 files changed

+126
-52
lines changed

2 files changed

+126
-52
lines changed

docs/preprocessing/normalization.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
```{eval-rst}
66
.. automodule:: nlpurify.preprocessing.normalization
7+
:member-order: bysource
78
```
89

910
</div>

nlpurify/preprocessing/normalization.py

Lines changed: 125 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -36,22 +36,6 @@ class WhiteSpace(NormalizerBaseModel):
3636
also multiple white spaces does not add any value to a text and
3737
should thus be removed to normalize the text.
3838
39-
:param strip, lstrip, rstrip: Settings to strip white spaces from
40-
beginning or end of the string for normalization. By default,
41-
all the spaces are removed as they do not provide any
42-
additional information and is mostly an error in typing text.
43-
44-
:param newline: Strip new line characters from a multiple line
45-
(i.e., a paragraph or text from "text area") to get one single
46-
text, defaults to True.
47-
48-
:param newlinesep: A string value which defaults to the systems'
49-
default new line seperator ("\r\n" `CRLF` for windows, and
50-
"\n" `LF` for *nix based systems) to replace from string.
51-
52-
:param multispace: Replace multiple spaces which often reduces the
53-
models' performance, defaults to True.
54-
5539
A modular approach is now enabled which is derived from a base
5640
normalization class. The usage is as below:
5741
@@ -76,36 +60,90 @@ class WhiteSpace(NormalizerBaseModel):
7660

7761
strip : bool = Field(
7862
True,
79-
description = "Strip of trailing white spaces from text."
63+
description = """
64+
Strip white spaces from both the beginning and the end of the
65+
string for normalization. By default, all the spaces are
66+
removed as they do not provide any additional information for
67+
an LLM/NLP based model, and removing them reduces token counts.
68+
69+
When the attribute is set to ``True`` the alternate parameters
70+
:attr:`lstrip` and :attr:`rstrip` are ignored, check model
71+
validator for more information. This uses the Python in-built
72+
string function as in example below:
73+
74+
.. code-block:: python
75+
76+
text = " this is a long text "
77+
print(text.strip())
78+
>> 'this is a long text'
79+
80+
Further customization like specifying alternate set of
81+
characters to be removed from the string is also supported by
82+
using the :attr:`strip_chars` attribute, for more information
83+
check `docs <https://docs.python.org/3/library/stdtypes.html#str.strip>`_.
84+
"""
8085
)
8186

8287
lstrip : bool = Field(
8388
True,
84-
description = "Strip white spaces from beginning of text."
89+
description = """
90+
When set to true (default) removes the leading whitespace characters
91+
from the string, or specify alternate set using
92+
:attr:`strip_chars` attribute.
93+
"""
8594
)
8695

8796
rstrip : bool = Field(
8897
True,
89-
description = "Strip white spaces from end of text."
98+
description = """
99+
When set to true (default) removes the trailing whitespace
100+
characters from the string, or specify alternate set using
101+
:attr:`strip_chars` attribute.
102+
"""
103+
)
104+
105+
strip_chars : str = Field(
106+
None,
107+
description = """
108+
Custom set of characters to be removed from the string. The
109+
argument is not a "prefix" or a "suffix" but a combination of
110+
all the values to be stripped. Check
111+
`docs <https://docs.python.org/3/library/stdtypes.html#str.strip>`_
112+
for more information.
113+
"""
90114
)
91115

92116
newline : bool = Field(
93117
True,
94-
description = "Strip any new line characters from text."
118+
description = """
119+
Strip new line characters from a multiple line (i.e., a
120+
paragraph or text from "text area") to get one single text,
121+
defaults to True. By default, :attr:`strip` removes new lines
122+
from the beginning and end, while this argument uses the string
123+
replace method to remove within lines - useful when the source
124+
text is paragraphed and needs to be cleaned.
125+
"""
95126

96127
)
97128

98129
# ? if new line is true, then also allow to provide new line
99130
# which defaults to the operating system default
100131
newlinesep : str = Field(
101132
default = os.linesep,
102-
description = "Default line seperator based on system."
133+
description = """
134+
A string value which defaults to the systems' default new line
135+
separator ("\\r\\n" `CRLF` for windows, and "\\n" `LF` for
136+
*nix based systems) to replace from string.
137+
"""
103138
)
104139

105140
# ? remove multiple whitespace - uses regular expressions
106141
multispace : bool = Field(
107142
default = True,
108-
description = "Remove multiple spaces from text using regexp."
143+
description = """
144+
Replace multiple spaces using regular expressions, which often
145+
reduces the models' performance, defaults to True.
146+
"""
109147
)
110148

111149

@@ -114,14 +152,14 @@ def apply(self, text : str) -> str:
114152

115153
# first - strip the white space from beginning and end of text
116154
if self.strip:
117-
text = text.strip()
155+
text = text.strip(chars = self.strip_chars)
118156
else:
119157
if self.lstrip:
120-
text = text.lstrip()
158+
text = text.lstrip(chars = self.strip_chars)
121159
elif self.rstrip:
122-
text = text.strip()
160+
text = text.strip(chars = self.strip_chars)
123161
else:
124-
pass # todo raise invalid warning for combination
162+
pass # error is raised during model assertion
125163

126164
# second, remove new line characters from the text
127165
if self.newline:
@@ -140,6 +178,11 @@ def model_validator(self) -> object:
140178
Pydantic generic model validator which validates all the
141179
fields using the self.attribute parameter and is generic to
142180
the class.
181+
182+
:raises UserWarning: A warning is raised when the parameter
183+
does not follow specified directive. It is recommended to
184+
check the attribute settings before using :func:`.apply()`
185+
or it might generate unwanted output.
143186
"""
144187

145188
s, ls, rs = self.strip, self.lstrip, self.rstrip
@@ -167,21 +210,27 @@ class CaseFolding(NormalizerBaseModel):
167210
A Model to Normalize Case Folding from Texts
168211
169212
Case folding from raw data source is often in title case, or is in
170-
a mixed case which hinder the NLP/LLM model's performance. The
213+
a mixed case which may hinder the NLP/LLM model's performance. The
171214
general convention is to convert all to lower cases using native
172215
Python function :func:`lower()` which is available for strings.
173-
174-
The class provides a pydantic model which does the same thing and
175-
when used in a pipeline provides robust and dynamic type checking
176-
and adheres to the normalization process.
177-
178-
:param upper, lower: Either set the text to upper case, or to
179-
lower case as per user choice. Default configuration sets the
180-
value to lower case.
181216
"""
182217

183-
upper : bool = False
184-
lower : bool = True
218+
upper : bool = Field(
219+
False,
220+
description = """
221+
Convert the text to upper case and return the text without
222+
altering other things. Defaults to False, the class converts
223+
the text to lower case which is recommended in LLM/NLP models.
224+
"""
225+
)
226+
227+
lower : bool = Field(
228+
True,
229+
description = """
230+
Convert the contents of the text to lower case (default) for
231+
an easy forward integration with LLM/NLP based models.
232+
"""
233+
)
185234

186235
def apply(self, text : str) -> str:
187236
"""
@@ -216,27 +265,51 @@ class StopWords(NormalizerBaseModel):
216265
that when removed from a text improves an NLP/LLM models'
217266
performance. By default, the model is set to use the stopwords in
218267
the English language.
268+
"""
219269

220-
:param language: A valid language name which is available and
221-
defined under :func:`nltk.corpus.stopwords`, defaults to the
222-
English language.
270+
language : str = Field(
271+
"english",
272+
description = """
273+
A valid language name which is available and defined under
274+
:func:`nltk.corpus.stopwords`, defaults to English. To see
275+
the list of valid languages, run the code below.
223276
224-
:param extrawords: The model gives the flexibility to add extra
225-
words which will be treated as stopwords which are not already
226-
defined under the :func:`nltk.corpus.stopwords` function. This
227-
can be helpful in dynamic debuging and quick manipulation of
228-
text to check forward models performance.
277+
.. code-block:: python
229278
230-
:param excludewords: Opposite to ``extrawords`` this attribute
231-
helps in updating the stopwords by removing/excluding words
232-
from already defined set.
233-
"""
279+
import nltk
280+
281+
# download the corpus if not already available
282+
# nltk.download("stopwords")
283+
from nltk.corpus import stopwords
234284
235-
language : str = "english"
236-
extrawords : list = []
285+
# once downloaded and available, check available list:
286+
print(stopwords.fileids())
287+
288+
The code block is dependent on :mod:`nltk` for more information
289+
check `docs <https://www.nltk.org/index.html>`_.
290+
"""
291+
)
292+
293+
extrawords : list = Field(
294+
[],
295+
description = """
296+
The model gives the flexibility to add extra words which will
297+
be treated as stopwords which are not already defined under
298+
the :func:`nltk.corpus.stopwords` function. This can be
299+
helpful in dynamic debugging and quick manipulation of text to
300+
check forward models performance.
301+
"""
302+
)
237303

238304
# ..versionadded:: 2025-10-24 - also allow words to be excluded
239-
excludewords : list = []
305+
excludewords : list = Field(
306+
[],
307+
description = """
308+
Opposite to ``extrawords`` this attribute helps in updating
309+
the stopwords by removing/excluding words from the already
310+
defined words in ``stopwords.words(self.language)`` list.
311+
"""
312+
)
240313

241314
# ! by default, nltk library provides stopwords in lower case
242315
# however, we can override and set the value as per our case needs

0 commit comments

Comments
 (0)