@@ -46,22 +46,6 @@ class WhiteSpace(_base_normalize):
4646 also multiple white spaces does not add any value to a text and
4747 should thus be removed to normalize the text.
4848
49- :param strip, lstrip, rstrip: Settings to strip white spaces from
50- beginning or end of the string for normalization. By default,
51- all the spaces are removed as they do not provide any
52- additional information and is mostly an error in typing text.
53-
54- :param newline: Strip new line characters from a multiple line
55- (i.e., a paragraph or text from "text area") to get one single
56- text, defaults to True.
57-
58- :param newlinesep: A string value which defaults to the systems'
59- default new line seperator ("\r \n " `CRLF` for windows, and
60- "\n " `LF` for *nix based systems) to replace from string.
61-
62- :param multispace: Replace multiple spaces which often reduces the
63- models' performance, defaults to True.
64-
6549 A modular approach is now enabled which is derived from a base
6650 normalization class. The usage is as below:
6751
@@ -78,25 +62,52 @@ class WhiteSpace(_base_normalize):
7862 '''
7963
8064 print(model.apply(text)) # uses default settings
81- >> This is a uncleaned text with lots of extra white space.
65+ >> " This is a uncleaned text with lots of extra white space."
8266
8367 The model does not accept additional arguments and the function
8468 ``.apply()`` is used to clean and normalize white space from text.
69+
70+ .. rubric:: Additional Note(s)
71+
72+ The new line separator defaults to the system default; for Windows based
73+ systems the separator is "\r \n " (i.e., ``CR LF`` notation), while
74+ for *nix based systems it is "\n " (i.e., ``LF`` notation).
8575 """
8676
87- strip : bool = Field (
88- True , help = "Strip of trailing white spaces from text."
77+ strip : bool = Field (
78+ default = True ,
79+ description = "Strip of trailing white spaces from text."
80+
81+ )
82+ lstrip : bool = Field (
83+ default = True ,
84+ description = "Strip white spaces from beginning of text."
85+
86+ )
87+ rstrip : bool = Field (
88+ default = True ,
89+ description = "Strip white spaces from end of text."
90+
91+ )
92+ newline : bool = Field (
93+ default = True ,
94+ description = "Strip any new line characters from text."
95+
8996 )
90- lstrip : bool = True
91- rstrip : bool = True
92- newline : bool = True
9397
9498 # ? if new line is true, then also allow to provide new line
9599 # which defaults to the operating system default
96- newlinesep : str = os .linesep
100+ newlinesep : str = Field (
101+ default = os .linesep ,
102+ description = "Default line seperator based on system."
103+ )
97104
98105 # ? remove multiple whitespace - uses regular expressions
99- multispace : bool = True
106+ multispace : bool = Field (
107+ default = True ,
108+ description = "Remove multiple spaces from text using regexp."
109+
110+ )
100111
101112
102113 def apply (self , text : str ) -> str :
@@ -183,7 +194,8 @@ def normalize(
183194 may also involve uniform case, typically :attr:`string.lower()`
184195 that can be used to create a word vector.
185196
186- :param str text: The base uncleaned text, all the operations are
197+ :type text: str
198+ :param text: The base uncleaned text, all the operations are
187199 done on this text to return a cleaner version. The string can
188200 be single line, multi-line (example from "text area") and can
189201 have any type of escape characters.
@@ -220,17 +232,19 @@ def normalize(
220232 Please refer to the underlying functions for detailed keyword
221233 arguments associated with each normalization technique(s) as below:
222234
223- * **whitespace** : Associated with white space removal, check
224- the underlying validation class is :class:`WhiteSpace` for
225- more details.
235+ * **whitespace** : Associated with white space removal, the
236+ function takes in arguments associated with native string
237+ functions of Python, check :class:`WhiteSpace` for more
238+ information.
226239
227- * **casefolding** : Associated to set uniform text case,
228- check the underlying validation class is :class:`CaseFolding`
229- for more details.
240+ * **casefolding** : Associated with setting a uniform text case; the
241+ model converts the whole string either to upper case or to
242+ lower case using Python native string functions, for
243+ more details check signature of :class:`CaseFolding` class.
230244
231- * **stopwords** : Associated with white stop words removal,
232- check the underlying validation class is :class:`StopWords`
233- for more details.
245+ * **stopwords** : Associated with stop words removal,
246+ check the underlying validation class is :class:`StopWords`
247+ for more details.
234248
235249 .. rubric:: Code Example(s)
236250
0 commit comments