fix(client): fix polynomial regular expression used on uncontrolled data (#1330)

remarkablemark · web-flow · commit 35f32fabf1c8 · 2026-02-06T01:27:44.000-05:00
* fix(client): fix polynomial regular expression used on uncontrolled data https://github.com/remarkablemark/html-dom-parser/security/code-scanning/22 https://github.com/remarkablemark/html-dom-parser/security/code-scanning/23 https://github.com/remarkablemark/html-dom-parser/security/code-scanning/24 https://github.com/remarkablemark/html-dom-parser/security/code-scanning/25 * refactor(client): replace regex with pure string-based `hasOpenTag` * refactor(client): move `hasOpenTag` from domparser to utilities * test(client): add utilities.test.ts
diff --git a/src/client/domparser.ts b/src/client/domparser.ts
@@ -1,15 +1,11 @@
-import { escapeSpecialCharacters } from './utilities';
+import { escapeSpecialCharacters, hasOpenTag } from './utilities';
 
 // constants
 const HTML = 'html';
 const HEAD = 'head';
 const BODY = 'body';
 const FIRST_TAG_REGEX = /<([a-zA-Z]+[0-9]?)/; // e.g., <h1>
 
-// match-all-characters in case of newlines (DOTALL)
-const HEAD_TAG_REGEX = /<head[^]*>/i;
-const BODY_TAG_REGEX = /<body[^]*>/i;
-
 // falls back to `parseFromString` if `createHTMLDocument` cannot be used
 /* eslint-disable @typescript-eslint/no-unused-vars */
 /* istanbul ignore start */
@@ -138,13 +134,13 @@ export default function domparser(html: string): NodeList {
 
       // the created document may come with filler head/body elements,
       // so make sure to remove them if they don't actually exist
-      if (!HEAD_TAG_REGEX.test(html)) {
+      if (!hasOpenTag(html, HEAD)) {
         const element = doc.querySelector(HEAD);
         /* istanbul ignore next */
         element?.parentNode?.removeChild(element);
       }
 
-      if (!BODY_TAG_REGEX.test(html)) {
+      if (!hasOpenTag(html, BODY)) {
         const element = doc.querySelector(BODY);
         /* istanbul ignore next */
         element?.parentNode?.removeChild(element);
@@ -158,7 +154,7 @@ export default function domparser(html: string): NodeList {
       const elements = parseFromDocument(html).querySelectorAll(firstTagName);
 
       // if there's a sibling element, then return both elements
-      if (BODY_TAG_REGEX.test(html) && HEAD_TAG_REGEX.test(html)) {
+      if (hasOpenTag(html, BODY) && hasOpenTag(html, HEAD)) {
         /* istanbul ignore next */
         return elements[0].parentNode?.childNodes ?? createNodeList();
       }
diff --git a/src/client/utilities.ts b/src/client/utilities.ts
@@ -57,6 +57,33 @@ function formatTagName(tagName: string): string {
   return tagName;
 }
 
+/**
+ * Checks if an HTML string contains an opening tag (case-insensitive).
+ * Every `<` is examined, so `<header><head>` still matches `head`.
+ *
+ * @param html - HTML string.
+ * @param tagName - Tag name to search for (e.g., 'head' or 'body').
+ * @returns - Whether the tag is found.
+ */
+export function hasOpenTag(html: string, tagName: string): boolean {
+  const openTag = '<' + tagName.toLowerCase();
+
+  // walk the original string so offsets stay valid even when lowercasing
+  // would change its length (e.g., U+0130 lowercases to two code units)
+  for (let i = html.indexOf('<'); i !== -1; i = html.indexOf('<', i + 1)) {
+    const end = i + openTag.length;
+    const char = html.charAt(end);
+    // the tag name must be followed by '>', '/', or HTML whitespace
+    const isDelimited = char !== '' && '>/ \t\n\r\f'.indexOf(char) !== -1;
+
+    if (isDelimited && html.slice(i, end).toLowerCase() === openTag) {
+      return true;
+    }
+  }
+
+  return false;
+}
+
 /**
  * Escapes special characters before parsing.
  *
diff --git a/test/client/utilities.test.ts b/test/client/utilities.test.ts
@@ -0,0 +1,33 @@
+import { hasOpenTag } from '../../src/client/utilities';
+
+describe('hasOpenTag', () => {
+  const present: [string, string][] = [
+    ['<head>', 'head'],
+    ['<body>', 'body'],
+    ['<Head>', 'head'],
+    ['<BODY>', 'body'],
+    ['<head class="foo">', 'head'],
+    ['<body id="bar">', 'body'],
+    ['<head\tclass="foo">', 'head'],
+    ['<head\nclass="foo">', 'head'],
+    ['<head\rclass="foo">', 'head'],
+    ['<head/>', 'head'],
+    ['<html><head></head><body></body></html>', 'head'],
+    ['<html><head></head><body></body></html>', 'body'],
+  ];
+  const absent: [string, string][] = [
+    ['', 'head'],
+    ['<div></div>', 'head'],
+    ['<div></div>', 'body'],
+    ['<header>', 'head'],
+    ['<heading>', 'head'],
+    ['<bodyguard>', 'body'],
+    ['<nobody>', 'body'],
+    ['hello world', 'head'],
+  ];
+  const check = (expected: boolean) => (html: string, tagName: string) => {
+    expect(hasOpenTag(html, tagName)).toBe(expected);
+  };
+  it.each(present)('returns true for %s with tag %s', check(true));
+  it.each(absent)('returns false for %s with tag %s', check(false));
+});