Skip to content

Commit e07e1d8

Browse files
deemonic and claude committed
fix: detect partial spacing profanity obfuscation
Profanity obfuscation using partial spacing was not being detected:

- "s hit" not detected as "shit"
- "f uck" not detected as "fuck"
- "t wat" not detected as "twat"

The isSpanningWordBoundary() method had overly strict logic that rejected legitimate partial spacing patterns. This fix modifies the method to check surrounding context instead of relying on heuristics about single-character parts:

- If alphanumeric char immediately before match → embedded in word → reject
- If alphanumeric char immediately after match → embedded in word → reject
- Otherwise → standalone text, likely intentional obfuscation → allow

Added 6 new test cases for partial spacing detection.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 5e1e0fc commit e07e1d8

File tree

2 files changed

+112
-29
lines changed

2 files changed

+112
-29
lines changed

src/BlaspService.php

Lines changed: 70 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -311,7 +311,7 @@ private function handle(): self
311311
$matchedText = $match[0];
312312

313313
// Check if the match inappropriately spans across word boundaries
314-
if ($this->isSpanningWordBoundary($matchedText)) {
314+
if ($this->isSpanningWordBoundary($matchedText, $normalizedString, $start)) {
315315
continue; // Skip this match as it spans word boundaries
316316
}
317317

@@ -406,42 +406,83 @@ private function isInsideHexToken(string $string, int $start, int $length): bool
406406
/**
407407
* Determine whether a matched substring inappropriately spans word boundaries.
408408
*/
409-
private function isSpanningWordBoundary(string $matchedText): bool
409+
private function isSpanningWordBoundary(string $matchedText, string $fullString, int $matchStart): bool
410410
{
411-
// If the match contains spaces, it might be spanning word boundaries
412-
if (preg_match('/\s+/', $matchedText)) {
413-
$parts = preg_split('/\s+/', $matchedText);
414-
415-
if (count($parts) > 1) {
416-
// Count how many parts are single characters
417-
$singleCharCount = 0;
418-
foreach ($parts as $part) {
419-
if (strlen($part) === 1 && preg_match('/[a-z]/i', $part)) {
420-
$singleCharCount++;
421-
}
422-
}
411+
// No spaces = not spanning
412+
if (!preg_match('/\s+/', $matchedText)) {
413+
return false;
414+
}
423415

424-
// If ALL parts are single characters, this is intentional obfuscation
425-
// (e.g., "f u c k i n g") - allow it
426-
if ($singleCharCount === count($parts)) {
427-
return false;
428-
}
416+
$parts = preg_split('/\s+/', $matchedText);
429417

430-
// If SOME parts are single characters at edges, this is likely
431-
// a cross-word match (e.g., "t êt" from "pourrait être") - reject it
432-
$firstPart = $parts[0];
433-
$lastPart = end($parts);
418+
if (count($parts) <= 1) {
419+
return false;
420+
}
434421

435-
if (strlen($lastPart) === 1 && preg_match('/[a-z]/i', $lastPart)) {
436-
return true;
437-
}
422+
// Count single-character parts
423+
$singleCharCount = 0;
424+
foreach ($parts as $part) {
425+
if (mb_strlen($part, 'UTF-8') === 1 && preg_match('/[a-z]/iu', $part)) {
426+
$singleCharCount++;
427+
}
428+
}
438429

439-
if (strlen($firstPart) === 1 && preg_match('/[a-z]/i', $firstPart)) {
440-
return true;
441-
}
430+
// ALL parts are single characters = definitely intentional (e.g., "f u c k i n g")
431+
if ($singleCharCount === count($parts)) {
432+
return false;
433+
}
434+
435+
// Check if match is embedded in a larger word
436+
$matchEnd = $matchStart + mb_strlen($matchedText, 'UTF-8');
437+
438+
$embeddedAtStart = false;
439+
$embeddedAtEnd = false;
440+
441+
// Character before match?
442+
if ($matchStart > 0) {
443+
$charBefore = mb_substr($fullString, $matchStart - 1, 1, 'UTF-8');
444+
if (preg_match('/\w/u', $charBefore)) {
445+
$embeddedAtStart = true;
446+
}
447+
}
448+
449+
// Character after match?
450+
if ($matchEnd < mb_strlen($fullString, 'UTF-8')) {
451+
$charAfter = mb_substr($fullString, $matchEnd, 1, 'UTF-8');
452+
if (preg_match('/\w/u', $charAfter)) {
453+
$embeddedAtEnd = true;
454+
}
455+
}
456+
457+
// If embedded on BOTH sides, it's completely within text - reject
458+
if ($embeddedAtStart && $embeddedAtEnd) {
459+
return true;
460+
}
461+
462+
// If embedded only at START: check if first part is single-char (likely accidental)
463+
// If first part is multi-char, the regex was just greedy - allow it
464+
if ($embeddedAtStart && !$embeddedAtEnd) {
465+
$firstPart = $parts[0];
466+
// If first part is a single letter, this is likely accidental word spanning
467+
// (e.g., "s hit" from "musicals hit" where 's' is from "musicals")
468+
if (mb_strlen($firstPart, 'UTF-8') === 1 && preg_match('/[a-z]/iu', $firstPart)) {
469+
return true;
470+
}
471+
// If first part is multi-char, the regex was greedy but there's still
472+
// a valid profanity in the non-embedded portion (e.g., "as @ss" from "has @ss")
473+
return false;
474+
}
475+
476+
// If embedded only at END: check if last part is single-char (likely accidental)
477+
if (!$embeddedAtStart && $embeddedAtEnd) {
478+
$lastPart = end($parts);
479+
if (mb_strlen($lastPart, 'UTF-8') === 1 && preg_match('/[a-z]/iu', $lastPart)) {
480+
return true;
442481
}
482+
return false;
443483
}
444484

485+
// Standalone partial spacing = intentional obfuscation
445486
return false;
446487
}
447488

tests/BlaspCheckTest.php

Lines changed: 42 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -306,4 +306,46 @@ public function test_spaced_profanity_without_substitution()
306306

307307
$this->assertTrue($result->hasProfanity);
308308
}
309+
310+
public function test_partial_spacing_s_hit()
311+
{
312+
$result = $this->blaspService->check('s hit');
313+
$this->assertTrue($result->hasProfanity);
314+
$this->assertContains('shit', $result->uniqueProfanitiesFound);
315+
}
316+
317+
public function test_partial_spacing_f_uck()
318+
{
319+
$result = $this->blaspService->check('f uck');
320+
$this->assertTrue($result->hasProfanity);
321+
$this->assertContains('fuck', $result->uniqueProfanitiesFound);
322+
}
323+
324+
public function test_partial_spacing_t_wat()
325+
{
326+
$result = $this->blaspService->check('t wat');
327+
$this->assertTrue($result->hasProfanity);
328+
$this->assertContains('twat', $result->uniqueProfanitiesFound);
329+
}
330+
331+
public function test_partial_spacing_fu_c_k()
332+
{
333+
$result = $this->blaspService->check('fu c k');
334+
$this->assertTrue($result->hasProfanity);
335+
$this->assertContains('fuck', $result->uniqueProfanitiesFound);
336+
}
337+
338+
public function test_partial_spacing_tw_a_t()
339+
{
340+
$result = $this->blaspService->check('tw a t');
341+
$this->assertTrue($result->hasProfanity);
342+
$this->assertContains('twat', $result->uniqueProfanitiesFound);
343+
}
344+
345+
public function test_no_false_positive_musicals_hit_embedded()
346+
{
347+
$result = $this->blaspService->check('This musicals hit');
348+
$this->assertFalse($result->hasProfanity);
349+
$this->assertSame('This musicals hit', $result->cleanString);
350+
}
309351
}

0 commit comments

Comments
 (0)