@@ -311,7 +311,7 @@ private function handle(): self
311311 $ matchedText = $ match [0 ];
312312
313313 // Check if the match inappropriately spans across word boundaries
314- if ($ this ->isSpanningWordBoundary ($ matchedText )) {
314+ if ($ this ->isSpanningWordBoundary ($ matchedText, $ normalizedString , $ start )) {
315315 continue ; // Skip this match as it spans word boundaries
316316 }
317317
@@ -406,42 +406,83 @@ private function isInsideHexToken(string $string, int $start, int $length): bool
/**
 * Determine whether a matched substring inappropriately spans word boundaries.
 *
 * A match that contains whitespace is either deliberate obfuscation
 * (e.g. "f u c k i n g") or an accident of the pattern swallowing pieces of
 * neighbouring words (e.g. "s hit" taken from "musicals hit"). The verdict
 * is reached in this order:
 *
 *  1. No whitespace in the match                     => not spanning.
 *  2. Every whitespace-separated part is one letter  => not spanning.
 *  3. Word characters directly before AND after      => spanning.
 *  4. Word character on one side only                => spanning only when
 *     the part touching that side is a single letter.
 *  5. No word character on either side               => not spanning.
 *
 * @param string $matchedText Text captured by the pattern.
 * @param string $fullString  String the match was found in.
 * @param int    $matchStart  Offset of the match inside $fullString, used as a
 *                            UTF-8 character offset. A byte offset is converted
 *                            when it can be recognised as one (see below).
 *
 * @return bool True when the match spans word boundaries and should be skipped.
 */
private function isSpanningWordBoundary(string $matchedText, string $fullString, int $matchStart): bool
{
    // A match without whitespace lives inside one token and cannot span words.
    if (!preg_match('/\s+/', $matchedText)) {
        return false;
    }

    $parts = preg_split('/\s+/', $matchedText);

    // preg_split() yields false on a PCRE failure; count(false) would throw on PHP 8.
    if ($parts === false || count($parts) <= 1) {
        return false;
    }

    // "Single letter" is Unicode-aware on both axes: one code point AND a letter
    // of any script, so a lone "à" is treated exactly like a lone "a".
    $isLoneLetter = static function (string $part): bool {
        return mb_strlen($part, 'UTF-8') === 1 && preg_match('/\p{L}/u', $part) === 1;
    };

    // Fully spaced-out text (every part a single letter) is intentional obfuscation.
    if (count(array_filter($parts, $isLoneLetter)) === count($parts)) {
        return false;
    }

    $matchLength = mb_strlen($matchedText, 'UTF-8');

    // NOTE(review): the caller is not fully visible from here. If $matchStart comes
    // from PREG_OFFSET_CAPTURE it is a BYTE offset, which the mb_* calls below would
    // misread whenever multibyte characters precede the match — confirm at the call
    // site. Until then, the offset is converted only when it provably does not work
    // as a character offset but does locate the match as a byte offset.
    if (
        $matchStart >= 0
        && mb_substr($fullString, $matchStart, $matchLength, 'UTF-8') !== $matchedText
        && substr($fullString, $matchStart, strlen($matchedText)) === $matchedText
    ) {
        $matchStart = mb_strlen(substr($fullString, 0, $matchStart), 'UTF-8');
    }

    $matchEnd = $matchStart + $matchLength;

    $isWordChar = static function (string $char): bool {
        return preg_match('/\w/u', $char) === 1;
    };

    // Is the match glued to a word character on either side?
    $embeddedAtStart = $matchStart > 0
        && $isWordChar(mb_substr($fullString, $matchStart - 1, 1, 'UTF-8'));
    $embeddedAtEnd = $matchEnd < mb_strlen($fullString, 'UTF-8')
        && $isWordChar(mb_substr($fullString, $matchEnd, 1, 'UTF-8'));

    // Glued on both sides: the match sits entirely inside surrounding text.
    if ($embeddedAtStart && $embeddedAtEnd) {
        return true;
    }

    // Glued at the start only: a single leading letter is most likely the tail of
    // the previous word ("s hit" from "musicals hit"). A longer first part just
    // means the pattern was greedy ("as @ss" from "has @ss") — keep the match.
    if ($embeddedAtStart) {
        return $isLoneLetter($parts[0]);
    }

    // Glued at the end only: same reasoning, applied to the last part.
    if ($embeddedAtEnd) {
        return $isLoneLetter($parts[count($parts) - 1]);
    }

    // Standalone, partially spaced text is treated as intentional obfuscation.
    return false;
}
447488
0 commit comments