Skip to content

Commit 35f32fa

Browse files
fix(client): fix polynomial regular expression used on uncontrolled data (#1330)
* fix(client): fix polynomial regular expression used on uncontrolled data
  https://github.com/remarkablemark/html-dom-parser/security/code-scanning/22
  https://github.com/remarkablemark/html-dom-parser/security/code-scanning/23
  https://github.com/remarkablemark/html-dom-parser/security/code-scanning/24
  https://github.com/remarkablemark/html-dom-parser/security/code-scanning/25
* refactor(client): replace regex with pure string-based `hasOpenTag`
* refactor(client): move `hasOpenTag` from domparser to utilities
* test(client): add utilities.test.ts
1 parent b445606 commit 35f32fa

File tree

3 files changed

+64
-8
lines changed

3 files changed

+64
-8
lines changed

src/client/domparser.ts

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,11 @@
1-
import { escapeSpecialCharacters } from './utilities';
1+
import { escapeSpecialCharacters, hasOpenTag } from './utilities';
22

33
// constants
44
const HTML = 'html';
55
const HEAD = 'head';
66
const BODY = 'body';
77
const FIRST_TAG_REGEX = /<([a-zA-Z]+[0-9]?)/; // e.g., <h1>
88

9-
// match-all-characters in case of newlines (DOTALL)
10-
const HEAD_TAG_REGEX = /<head[^]*>/i;
11-
const BODY_TAG_REGEX = /<body[^]*>/i;
12-
139
// falls back to `parseFromString` if `createHTMLDocument` cannot be used
1410
/* eslint-disable @typescript-eslint/no-unused-vars */
1511
/* istanbul ignore start */
@@ -138,13 +134,13 @@ export default function domparser(html: string): NodeList {
138134

139135
// the created document may come with filler head/body elements,
140136
// so make sure to remove them if they don't actually exist
141-
if (!HEAD_TAG_REGEX.test(html)) {
137+
if (!hasOpenTag(html, HEAD)) {
142138
const element = doc.querySelector(HEAD);
143139
/* istanbul ignore next */
144140
element?.parentNode?.removeChild(element);
145141
}
146142

147-
if (!BODY_TAG_REGEX.test(html)) {
143+
if (!hasOpenTag(html, BODY)) {
148144
const element = doc.querySelector(BODY);
149145
/* istanbul ignore next */
150146
element?.parentNode?.removeChild(element);
@@ -158,7 +154,7 @@ export default function domparser(html: string): NodeList {
158154
const elements = parseFromDocument(html).querySelectorAll(firstTagName);
159155

160156
// if there's a sibling element, then return both elements
161-
if (BODY_TAG_REGEX.test(html) && HEAD_TAG_REGEX.test(html)) {
157+
if (hasOpenTag(html, BODY) && hasOpenTag(html, HEAD)) {
162158
/* istanbul ignore next */
163159
return elements[0].parentNode?.childNodes ?? createNodeList();
164160
}

src/client/utilities.ts

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,33 @@ function formatTagName(tagName: string): string {
5757
return tagName;
5858
}
5959

60+
/**
 * Checks if an HTML string contains an opening tag (case-insensitive).
 *
 * Every occurrence of `<tagName` is inspected, so a longer tag name that
 * shares the same prefix (e.g., `<header>` appearing before `<head>`) does
 * not hide a real match later in the string. Plain string scanning is used
 * instead of a regular expression.
 *
 * @param html - HTML string.
 * @param tagName - Tag name to search for (e.g., 'head' or 'body').
 * @returns - Whether the tag is found.
 */
export function hasOpenTag(html: string, tagName: string): boolean {
  const openTag = '<' + tagName.toLowerCase();

  // search and inspect the same lowercased string: lowercasing can change the
  // string length (e.g., 'İ' becomes two code units), so an index found in the
  // lowercased copy must not be applied to the original `html`
  const lowerCaseHtml = html.toLowerCase();

  for (
    let index = lowerCaseHtml.indexOf(openTag);
    index !== -1;
    index = lowerCaseHtml.indexOf(openTag, index + 1)
  ) {
    // `undefined` when the match sits at the very end of the string
    const char = lowerCaseHtml[index + openTag.length];

    // the character after the tag name must close the tag ('>' or '/')
    // or be whitespace (for attributes)
    if (
      char === '>' ||
      char === '/' ||
      char === ' ' ||
      char === '\t' ||
      char === '\n' ||
      char === '\r' ||
      char === '\f'
    ) {
      return true;
    }
  }

  return false;
}
86+
6087
/**
6188
* Escapes special characters before parsing.
6289
*

test/client/utilities.test.ts

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
import { hasOpenTag } from '../../src/client/utilities';
2+
3+
describe('hasOpenTag', () => {
  // [html, tagName] pairs where `html` contains an opening tag for `tagName`
  const matchingCases: [string, string][] = [
    ['<head>', 'head'],
    ['<body>', 'body'],
    ['<Head>', 'head'],
    ['<BODY>', 'body'],
    ['<head class="foo">', 'head'],
    ['<body id="bar">', 'body'],
    ['<head\tclass="foo">', 'head'],
    ['<head\nclass="foo">', 'head'],
    ['<head\rclass="foo">', 'head'],
    ['<head/>', 'head'],
    ['<html><head></head><body></body></html>', 'head'],
    ['<html><head></head><body></body></html>', 'body'],
  ];

  // [html, tagName] pairs where `html` has no opening tag for `tagName`,
  // including tag names that merely start with or end with `tagName`
  const nonMatchingCases: [string, string][] = [
    ['', 'head'],
    ['<div></div>', 'head'],
    ['<div></div>', 'body'],
    ['<header>', 'head'],
    ['<heading>', 'head'],
    ['<bodyguard>', 'body'],
    ['<nobody>', 'body'],
    ['hello world', 'head'],
  ];

  it.each(matchingCases)(
    'returns true for %s with tag %s',
    (html, tagName) => {
      const found = hasOpenTag(html, tagName);
      expect(found).toBe(true);
    },
  );

  it.each(nonMatchingCases)(
    'returns false for %s with tag %s',
    (html, tagName) => {
      const found = hasOpenTag(html, tagName);
      expect(found).toBe(false);
    },
  );
});

0 commit comments

Comments
 (0)