@@ -9,26 +9,44 @@ import (
99 "unicode"
1010)
1111
12- // Types of inappropriateness
12+ // Types and severities of inappropriateness
1313//
14- // For compability, only pass a single Type or a bitwise OR of multiple Type's,
15- // and always reference Type's by name as their value may change from version
16- // to version. Other operations on Type's are not supported.
14+ // For compatibility, always reference them by name as their value may change
15+ // from version to version.
16+ //
17+ // Use a bitwise OR of multiple profanity classifications, and a bitwise AND to
18+ // specify a severity level (default Mild). The definition of Inappropriate
19+ // (mildly profane, mildly offensive, mildly sexual, or severely mean) serves
20+ // as a good example.
21+ //
22+ // Other operations on Type's are NOT supported.
23+ //
24+ // Severities should be interpreted on an "at least" basis, e.g. Mild means
25+ // Mild, Moderate, OR Severe.
1726type Type uint32
1827
1928const (
20- Profane Type = 1 << iota
29+ Profane Type = 0b111 << ( iota * 3 )
2130 Offensive
2231 Sexual
2332 Mean
2433 Spam
25- Inappropriate = Profane | Offensive | Sexual
34+ Inappropriate = Profane | Offensive | Sexual | ( Mean & Severe )
2635 Any = Profane | Offensive | Sexual | Spam | Mean
36+ )
2737
38+ const (
39+ Mild Type = 0b111_111_111_111_111
40+ Moderate = 0b110_110_110_110_110
41+ Severe = 0b100_100_100_100_100
42+ )
43+
44+ const (
2845 countableTypes = 4
2946
30- minMatchable rune = 0x0020
31- maxMatchable rune = 0x007E
47+ // A subset of the ASCII range that requires no sanitization
48+ minNormal rune = 0x0020
49+ maxNormal rune = 0x007E
3250)
3351
3452var (
@@ -59,15 +77,15 @@ func IsInappropriate(text string) bool {
5977// Is returns whether a phrase contains words matching the types flag, useful if
6078// checking only one type or set of types is needed
6179func Is (text string , types Type ) bool {
62- return Scan (text )& types > 0
80+ return Scan (text )& types != 0
6381}
6482
6583// Scan returns a bitmask of all types, useful if checking multiple types or
6684// sets of types is needed, without multiple calls to Is(text, types)
6785func Scan (text string ) (types Type ) {
6886 // Figure out if sanitization is needed, and if so, do it
6987 for _ , textRune := range text {
70- if textRune < minMatchable || maxMatchable < textRune {
88+ if textRune < minNormal || maxNormal < textRune {
7189 // Sanitize
7290 buf := make ([]byte , 0 , len (text ))
7391 _ , n , _ := transform .Append (removeAccentsTransform , buf , []byte (text ))
@@ -99,7 +117,7 @@ func Scan(text string) (types Type) {
99117 var replacement string
100118 if int (textByte ) < len (replacements ) {
101119 replacement = replacements [textByte ]
102- } else if textRune > maxMatchable {
120+ } else if textRune > maxNormal {
103121 replacement = runeReplacements [textRune ]
104122 if replacement == "" {
105123 lowerRune := unicode .ToLower (textRune )
@@ -118,19 +136,13 @@ func Scan(text string) (types Type) {
118136 textByte = replacement [0 ]
119137 textBytes = replacement
120138 matchable = true
121- case textRune < minMatchable || maxMatchable < textRune :
122- // Unhandled runes (not printable, not representable as byte, etc.)
123- // matchable = false implied
124- switch textRune {
125- case '\n' , '\r' , '\t' :
126- skippable = true
127- }
128139 default :
140+ // matchable = false implied
129141 switch textByte {
130142 case '*' : // these count as replacements
131143 replaced = true
132144 fallthrough
133- case ' ' , '~' , '-' , '_' , '.' , ',' : // false positives may contain these
145+ case ' ' , '~' , '-' , '_' , '.' , ',' , '\n' , '\r' , '\t' : // false positives may contain these
134146 skippable = true
135147 }
136148 }
@@ -204,18 +216,30 @@ func Scan(text string) (types Type) {
204216 separate = skippable || ! matchable
205217 }
206218
207- for i := 0 ; i < countableTypes ; i ++ {
208- if countableTypeLevels [i ] > 0 {
209- types |= 1 << i
219+ for i , level := range countableTypeLevels {
220+ var severity Type
221+
222+ if level >= 3 {
223+ severity = 0b100 // severe
224+ } else if level == 2 {
225+ severity = 0b010 // moderate
226+ } else if level == 1 {
227+ severity = 0b001 // mild
210228 }
229+
230+ types |= severity << (i * 3 )
211231 }
212232
213233 // Min length is arbitrary, but must be > 0 to avoid dividing by zero
214234 if len (text ) > 5 {
215235 spamPercent := (100 / 2 ) * (upperCount + repetitionCount ) / len (text )
216236
217- if spamPercent > 30 {
218- types |= Spam
237+ // TODO: Define severe spam
238+
239+ if spamPercent > 50 {
240+ types |= Spam & Moderate
241+ } else if spamPercent > 30 {
242+ types |= Spam & Mild
219243 }
220244 }
221245
0 commit comments