Skip to content
This repository was archived by the owner on Jan 21, 2026. It is now read-only.

Commit 62fd3e2

Browse files
author
Vicent Marti
committed
tokenizer: Refactor ASCII-only helpers
The ASCII-only helpers in the tokenizer should be used in other parts of the codebase (namely, when comparing tag names case-insensitively). Hence, export them in the util.h header.
1 parent a87add3 commit 62fd3e2

File tree

3 files changed

+49
-42
lines changed

3 files changed

+49
-42
lines changed

src/tag.c

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
// Author: jdtang@google.com (Jonathan Tang)
1616

1717
#include "gumbo.h"
18+
#include "util.h"
1819

1920
#include <assert.h>
2021
#include <ctype.h>
@@ -60,14 +61,21 @@ void gumbo_tag_from_original_text(GumboStringPiece* text) {
6061
}
6162
}
6263

64+
/*
65+
* Override the `tolower` implementation in the perfect hash
66+
* to use ours. We need a custom `tolower` that only does ASCII
67+
* characters and is locale-independent to remain true to the
68+
* standard
69+
*/
70+
#define tolower(c) gumbo_tolower(c)
6371
#include "tag_perf.h"
6472

6573
static int
6674
case_memcmp(const char *s1, const char *s2, int n)
6775
{
6876
while (n--) {
69-
unsigned char c1 = tolower(*s1++);
70-
unsigned char c2 = tolower(*s2++);
77+
unsigned char c1 = gumbo_tolower(*s1++);
78+
unsigned char c2 = gumbo_tolower(*s2++);
7179
if (c1 != c2)
7280
return (int)c1 - (int)c2;
7381
}

src/tokenizer.c

Lines changed: 29 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -308,17 +308,6 @@ static void tokenizer_add_parse_error(GumboParser* parser, GumboErrorType type)
308308
}
309309
}
310310

311-
static bool is_alpha(int c) {
312-
// We don't use ISO C isupper/islower functions here because they
313-
// depend upon the program's locale, while the behavior of the HTML5 spec is
314-
// independent of which locale the program is run in.
315-
return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
316-
}
317-
318-
static int ensure_lowercase(int c) {
319-
return c >= 'A' && c <= 'Z' ? c + 0x20 : c;
320-
}
321-
322311
static GumboTokenType get_char_token_type(bool is_in_cdata, int c) {
323312
if (is_in_cdata && c > 0) {
324313
return GUMBO_TOKEN_CDATA;
@@ -689,9 +678,9 @@ static void start_new_tag(GumboParser* parser, bool is_start_tag) {
689678
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
690679
GumboTagState* tag_state = &tokenizer->_tag_state;
691680
int c = utf8iterator_current(&tokenizer->_input);
692-
assert(is_alpha(c));
693-
c = ensure_lowercase(c);
694-
assert(is_alpha(c));
681+
assert(gumbo_isalpha(c));
682+
c = gumbo_tolower(c);
683+
assert(gumbo_isalpha(c));
695684

696685
initialize_tag_buffer(parser);
697686
gumbo_string_buffer_append_codepoint(c, &tag_state->_buffer);
@@ -1027,7 +1016,7 @@ static StateResult handle_tag_open_state(
10271016
tokenizer_add_parse_error(parser, GUMBO_ERR_TAG_STARTS_WITH_QUESTION);
10281017
return NEXT_CHAR;
10291018
default:
1030-
if (is_alpha(c)) {
1019+
if (gumbo_isalpha(c)) {
10311020
gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_NAME);
10321021
start_new_tag(parser, true);
10331022
return NEXT_CHAR;
@@ -1055,7 +1044,7 @@ static StateResult handle_end_tag_open_state(
10551044
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
10561045
return emit_temporary_buffer(parser, output);
10571046
default:
1058-
if (is_alpha(c)) {
1047+
if (gumbo_isalpha(c)) {
10591048
gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_NAME);
10601049
start_new_tag(parser, false);
10611050
} else {
@@ -1098,7 +1087,7 @@ static StateResult handle_tag_name_state(
10981087
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
10991088
return NEXT_CHAR;
11001089
default:
1101-
append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1090+
append_char_to_tag_buffer(parser, gumbo_tolower(c), true);
11021091
return NEXT_CHAR;
11031092
}
11041093
}
@@ -1124,7 +1113,7 @@ static StateResult handle_rcdata_end_tag_open_state(
11241113
GumboParser* parser, GumboTokenizerState* tokenizer,
11251114
int c, GumboToken* output) {
11261115
assert(temporary_buffer_equals(parser, "</"));
1127-
if (is_alpha(c)) {
1116+
if (gumbo_isalpha(c)) {
11281117
gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_NAME);
11291118
start_new_tag(parser, false);
11301119
append_char_to_temporary_buffer(parser, c);
@@ -1141,8 +1130,8 @@ static StateResult handle_rcdata_end_tag_name_state(
11411130
GumboParser* parser, GumboTokenizerState* tokenizer,
11421131
int c, GumboToken* output) {
11431132
assert(tokenizer->_temporary_buffer.length >= 2);
1144-
if (is_alpha(c)) {
1145-
append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1133+
if (gumbo_isalpha(c)) {
1134+
append_char_to_tag_buffer(parser, gumbo_tolower(c), true);
11461135
append_char_to_temporary_buffer(parser, c);
11471136
return NEXT_CHAR;
11481137
} else if (is_appropriate_end_tag(parser)) {
@@ -1190,7 +1179,7 @@ static StateResult handle_rawtext_end_tag_open_state(
11901179
GumboParser* parser, GumboTokenizerState* tokenizer,
11911180
int c, GumboToken* output) {
11921181
assert(temporary_buffer_equals(parser, "</"));
1193-
if (is_alpha(c)) {
1182+
if (gumbo_isalpha(c)) {
11941183
gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_NAME);
11951184
start_new_tag(parser, false);
11961185
append_char_to_temporary_buffer(parser, c);
@@ -1208,8 +1197,8 @@ static StateResult handle_rawtext_end_tag_name_state(
12081197
assert(tokenizer->_temporary_buffer.length >= 2);
12091198
gumbo_debug("Last end tag: %*s\n", (int) tokenizer->_tag_state._buffer.length,
12101199
tokenizer->_tag_state._buffer.data);
1211-
if (is_alpha(c)) {
1212-
append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1200+
if (gumbo_isalpha(c)) {
1201+
append_char_to_tag_buffer(parser, gumbo_tolower(c), true);
12131202
append_char_to_temporary_buffer(parser, c);
12141203
return NEXT_CHAR;
12151204
} else if (is_appropriate_end_tag(parser)) {
@@ -1262,7 +1251,7 @@ static StateResult handle_script_end_tag_open_state(
12621251
GumboParser* parser, GumboTokenizerState* tokenizer,
12631252
int c, GumboToken* output) {
12641253
assert(temporary_buffer_equals(parser, "</"));
1265-
if (is_alpha(c)) {
1254+
if (gumbo_isalpha(c)) {
12661255
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_END_TAG_NAME);
12671256
start_new_tag(parser, false);
12681257
append_char_to_temporary_buffer(parser, c);
@@ -1278,8 +1267,8 @@ static StateResult handle_script_end_tag_name_state(
12781267
GumboParser* parser, GumboTokenizerState* tokenizer,
12791268
int c, GumboToken* output) {
12801269
assert(tokenizer->_temporary_buffer.length >= 2);
1281-
if (is_alpha(c)) {
1282-
append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1270+
if (gumbo_isalpha(c)) {
1271+
append_char_to_tag_buffer(parser, gumbo_tolower(c), true);
12831272
append_char_to_temporary_buffer(parser, c);
12841273
return NEXT_CHAR;
12851274
} else if (is_appropriate_end_tag(parser)) {
@@ -1421,11 +1410,11 @@ static StateResult handle_script_escaped_lt_state(
14211410
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_OPEN);
14221411
append_char_to_temporary_buffer(parser, c);
14231412
return NEXT_CHAR;
1424-
} else if (is_alpha(c)) {
1413+
} else if (gumbo_isalpha(c)) {
14251414
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_START);
14261415
append_char_to_temporary_buffer(parser, c);
14271416
gumbo_string_buffer_append_codepoint(
1428-
ensure_lowercase(c), &tokenizer->_script_data_buffer);
1417+
gumbo_tolower(c), &tokenizer->_script_data_buffer);
14291418
return emit_temporary_buffer(parser, output);
14301419
} else {
14311420
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
@@ -1438,7 +1427,7 @@ static StateResult handle_script_escaped_end_tag_open_state(
14381427
GumboParser* parser, GumboTokenizerState* tokenizer,
14391428
int c, GumboToken* output) {
14401429
assert(temporary_buffer_equals(parser, "</"));
1441-
if (is_alpha(c)) {
1430+
if (gumbo_isalpha(c)) {
14421431
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME);
14431432
start_new_tag(parser, false);
14441433
append_char_to_temporary_buffer(parser, c);
@@ -1454,8 +1443,8 @@ static StateResult handle_script_escaped_end_tag_name_state(
14541443
GumboParser* parser, GumboTokenizerState* tokenizer,
14551444
int c, GumboToken* output) {
14561445
assert(tokenizer->_temporary_buffer.length >= 2);
1457-
if (is_alpha(c)) {
1458-
append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1446+
if (gumbo_isalpha(c)) {
1447+
append_char_to_tag_buffer(parser, gumbo_tolower(c), true);
14591448
append_char_to_temporary_buffer(parser, c);
14601449
return NEXT_CHAR;
14611450
} else if (is_appropriate_end_tag(parser)) {
@@ -1498,9 +1487,9 @@ static StateResult handle_script_double_escaped_start_state(
14981487
? GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED : GUMBO_LEX_SCRIPT_ESCAPED);
14991488
return emit_current_char(parser, output);
15001489
default:
1501-
if (is_alpha(c)) {
1490+
if (gumbo_isalpha(c)) {
15021491
gumbo_string_buffer_append_codepoint(
1503-
ensure_lowercase(c), &tokenizer->_script_data_buffer);
1492+
gumbo_tolower(c), &tokenizer->_script_data_buffer);
15041493
return emit_current_char(parser, output);
15051494
} else {
15061495
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
@@ -1616,9 +1605,9 @@ static StateResult handle_script_double_escaped_end_state(
16161605
? GUMBO_LEX_SCRIPT_ESCAPED : GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
16171606
return emit_current_char(parser, output);
16181607
default:
1619-
if (is_alpha(c)) {
1608+
if (gumbo_isalpha(c)) {
16201609
gumbo_string_buffer_append_codepoint(
1621-
ensure_lowercase(c), &tokenizer->_script_data_buffer);
1610+
gumbo_tolower(c), &tokenizer->_script_data_buffer);
16221611
return emit_current_char(parser, output);
16231612
} else {
16241613
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
@@ -1662,7 +1651,7 @@ static StateResult handle_before_attr_name_state(
16621651
// Fall through.
16631652
default:
16641653
gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1665-
append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1654+
append_char_to_tag_buffer(parser, gumbo_tolower(c), true);
16661655
return NEXT_CHAR;
16671656
}
16681657
}
@@ -1706,7 +1695,7 @@ static StateResult handle_attr_name_state(
17061695
tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
17071696
// Fall through.
17081697
default:
1709-
append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1698+
append_char_to_tag_buffer(parser, gumbo_tolower(c), true);
17101699
return NEXT_CHAR;
17111700
}
17121701
}
@@ -1747,7 +1736,7 @@ static StateResult handle_after_attr_name_state(
17471736
// Fall through.
17481737
default:
17491738
gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1750-
append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1739+
append_char_to_tag_buffer(parser, gumbo_tolower(c), true);
17511740
return NEXT_CHAR;
17521741
}
17531742
}
@@ -2298,7 +2287,7 @@ static StateResult handle_before_doctype_name_state(
22982287
default:
22992288
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
23002289
tokenizer->_doc_type_state.force_quirks = false;
2301-
append_char_to_temporary_buffer(parser, ensure_lowercase(c));
2290+
append_char_to_temporary_buffer(parser, gumbo_tolower(c));
23022291
return NEXT_CHAR;
23032292
}
23042293
}
@@ -2340,7 +2329,7 @@ static StateResult handle_doctype_name_state(
23402329
default:
23412330
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
23422331
tokenizer->_doc_type_state.force_quirks = false;
2343-
append_char_to_temporary_buffer(parser, ensure_lowercase(c));
2332+
append_char_to_temporary_buffer(parser, gumbo_tolower(c));
23442333
return NEXT_CHAR;
23452334
}
23462335
}

src/util.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,16 @@ static inline void gumbo_free(void *ptr)
5757
gumbo_user_free(ptr);
5858
}
5959

60+
static inline int gumbo_tolower(int c)
61+
{
62+
return c | ((c >= 'A' && c <= 'Z') << 5);
63+
}
64+
65+
static inline bool gumbo_isalpha(int c)
66+
{
67+
return (c | 0x20) >= 'a' && (c | 0x20) <= 'z';
68+
}
69+
6070
// Debug wrapper for printf, to make it easier to turn off debugging info when
6171
// required.
6272
void gumbo_debug(const char* format, ...);

0 commit comments

Comments
 (0)