Skip to content
Merged
Show file tree
Hide file tree
Changes from 25 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
63b700f
fix(markdown): normalize link destinations and titles in html test re…
jfmcdowell Jan 25, 2026
118e210
fix(markdown): improve link and reference parsing
jfmcdowell Jan 25, 2026
9a4dae0
fix(markdown): enhance inline element parsing and emphasis
jfmcdowell Jan 25, 2026
b6763e5
fix(markdown): refine block structure and list handling
jfmcdowell Jan 25, 2026
14bedaa
fix(markdown): polish CommonMark conformance and HTML output
jfmcdowell Jan 28, 2026
52557fe
feat(string_case): add Unicode case folding helper
jfmcdowell Jan 29, 2026
a33d915
refactor(markdown): split inline parsing into modules
jfmcdowell Jan 29, 2026
292c747
refactor(markdown): simplify inline list parsing flow
jfmcdowell Jan 29, 2026
70787dd
refactor(markdown): standardize imports to use crate:: paths
jfmcdowell Jan 30, 2026
5589eea
refactor(markdown): move lexer imports to module scope
jfmcdowell Jan 30, 2026
f093390
refactor(markdown): use preorder visitor for to_html renderer
jfmcdowell Jan 30, 2026
80a87db
refactor(markdown): tidy list marker detection
jfmcdowell Jan 30, 2026
de946f8
refactor(markdown): fix imports
jfmcdowell Jan 30, 2026
ad53584
refactor(markdown): fix magic numbers
jfmcdowell Jan 30, 2026
897cdaf
fix(markdown): reuse casefold result when possible
jfmcdowell Jan 30, 2026
c1bd5ef
refactor(markdown): move link reference collection
jfmcdowell Jan 30, 2026
dfad87a
fix(markdown): align thematic break lookahead
jfmcdowell Jan 30, 2026
155b503
fix(markdown): gate utf8proc on non-wasm targets
jfmcdowell Jan 30, 2026
e584d59
test(markdown): add reference label unit tests
jfmcdowell Jan 31, 2026
f51e528
refactor(string_case): replace utf8proc with caseless for Unicode cas…
jfmcdowell Jan 31, 2026
0374ce0
fix(markdown): guard html block interrupt indent
jfmcdowell Jan 31, 2026
bba934e
fix(markdown): require EOL for thematic break fast path
jfmcdowell Jan 31, 2026
73cf570
fix(markdown): tighten percent-encoding validation
jfmcdowell Jan 31, 2026
270c923
chore(markdown): move caseless and percent-encoding
jfmcdowell Jan 31, 2026
e0f5be5
refactor(markdown): return Cow for label normalization
jfmcdowell Jan 31, 2026
2c87154
fix(markdown): prevent double-escaping in image alt text
jfmcdowell Jan 31, 2026
e63d4f8
Merge branch 'next' into fix/markdown-inline-edge-cases
ematipico Feb 6, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 38 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion crates/biome_markdown_parser/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,11 @@ biome_markdown_factory = { workspace = true }
biome_markdown_syntax = { workspace = true }
biome_parser = { workspace = true }
biome_rowan = { workspace = true }
biome_string_case = { workspace = true }
biome_unicode_table = { workspace = true }
# Optional dependency for test_utils feature (HTML rendering for spec tests)
htmlize = { version = "1.0.6", features = ["unescape"], optional = true }
percent-encoding = { version = "2.3.2", optional = true }
tracing = { workspace = true }
unicode-bom = { workspace = true }

Expand All @@ -44,7 +46,7 @@ tests_macros = { path = "../tests_macros" }
[features]
# Enables test utilities (to_html module) for CommonMark spec compliance testing.
# Not included in production builds to avoid unnecessary dependencies and code.
test_utils = ["dep:htmlize"]
test_utils = ["dep:htmlize", "dep:percent-encoding"]

[lints]
workspace = true
75 changes: 58 additions & 17 deletions crates/biome_markdown_parser/src/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ use biome_unicode_table::lookup_byte;
/// - `FencedCodeBlock`: Inside fenced code block, no markdown parsing
/// - `HtmlBlock`: Inside HTML block, minimal markdown parsing
/// - `LinkDefinition`: Inside link reference definition, whitespace separates tokens
/// - `CodeSpan`: Inside inline code span, backslashes are literal (no escapes)
/// - `EmphasisInline`: Emit single STAR/UNDERSCORE tokens for partial delimiter consumption
#[derive(Debug, Copy, Clone, Eq, PartialEq, Default)]
pub enum MarkdownLexContext {
/// Normal markdown parsing with full inline element detection.
Expand All @@ -39,6 +41,16 @@ pub enum MarkdownLexContext {
/// In this context, whitespace is significant and separates destination from title.
/// Text tokens stop at whitespace to allow proper parsing.
LinkDefinition,
/// Inside an inline code span.
/// Per CommonMark §6.1, backslash escapes are not processed inside code spans.
/// Backslash is treated as a literal character, not an escape.
CodeSpan,
/// Inside emphasis delimiter processing.
/// In this context, `*` and `_` are always emitted as single-character tokens
/// (STAR, UNDERSCORE) rather than double tokens (DOUBLE_STAR, DOUBLE_UNDERSCORE).
/// This allows partial consumption of delimiter runs when the match algorithm
/// determines only 1 char should be used from a 2-char run.
EmphasisInline,
}

impl LexContext for MarkdownLexContext {
Expand All @@ -57,6 +69,10 @@ pub enum MarkdownReLexContext {
Regular,
/// Re-lex for link definition context where whitespace is significant.
LinkDefinition,
/// Re-lex for emphasis inline context where `*` and `_` emit single tokens.
/// Used when the emphasis matching algorithm needs to partially consume
/// a DOUBLE_STAR or DOUBLE_UNDERSCORE token.
EmphasisInline,
}

/// An extremely fast, lookup table based, lossless Markdown lexer
Expand Down Expand Up @@ -135,6 +151,7 @@ impl<'src> Lexer<'src> for MarkdownLexer<'src> {
// This ensures the *next* token (after NEWLINE) has PRECEDING_LINE_BREAK set.
if !kind.is_trivia()
&& kind != NEWLINE
&& kind != MD_HARD_LINE_LITERAL
&& !(kind == MD_TEXTUAL_LITERAL
&& self.after_newline
&& self.current_text_is_whitespace())
Expand Down Expand Up @@ -230,9 +247,14 @@ impl<'src> MarkdownLexer<'src> {
// - In middle of line: whitespace is just text content, include in textual token
// - Exception: 2+ spaces before newline is a hard line break
// - In LinkDefinition context: whitespace is always significant (separates destination from title)
// - In CodeSpan context: whitespace is literal content, no hard-line-break detection
WHS => {
if current == b'\n' || current == b'\r' {
self.consume_newline()
} else if matches!(context, MarkdownLexContext::CodeSpan) {
// In code span context, whitespace is literal content.
// No hard-line-break detection - the renderer normalizes line endings to spaces.
self.consume_textual(context)
} else if matches!(context, MarkdownLexContext::LinkDefinition) {
// In link definition context, whitespace separates tokens.
// We consume it as textual literal so it's not treated as trivia by the parser.
Expand Down Expand Up @@ -267,7 +289,15 @@ impl<'src> MarkdownLexer<'src> {
PNC => self.consume_byte(R_PAREN),
COL => self.consume_byte(COLON),
AMP => self.consume_entity_or_textual(context),
BSL => self.consume_escape(),
BSL => {
// Per CommonMark §6.1, backslash escapes are NOT processed inside code spans.
// Backslash is literal, so `\`` produces a literal backslash followed by backtick.
if matches!(context, MarkdownLexContext::CodeSpan) {
self.consume_textual(context)
} else {
self.consume_escape()
}
}
// = at line start could be setext heading underline
EQL if self.after_newline => self.consume_setext_underline_or_textual(),
_ => {
Expand Down Expand Up @@ -677,8 +707,8 @@ impl<'src> MarkdownLexer<'src> {
///
/// For `-` at line start:
/// - 1-2 dashes followed by newline: setext underline (H2)
/// - 3+ dashes followed by newline: thematic break (not setext, since the parser
/// will convert thematic breaks to setext headers when preceded by paragraph)
/// - 3+ dashes followed by newline: thematic break (not setext; the parser may
/// convert dash-only thematic breaks to setext when preceded by a paragraph)
fn consume_thematic_break_or_emphasis(
&mut self,
dispatched: Dispatch,
Expand All @@ -705,7 +735,7 @@ impl<'src> MarkdownLexer<'src> {

// For `-` at line start with 1-2 dashes, emit setext underline.
// 3+ dashes could be thematic break, so let that logic handle it.
// The parser will convert thematic breaks to setext when preceded by paragraph.
// The parser may convert dash-only thematic breaks to setext when preceded by paragraph.
if start_char == b'-' && self.after_newline {
let mut dash_count = 0;
// Consume only `-` characters (no spaces between)
Expand Down Expand Up @@ -753,6 +783,19 @@ impl<'src> MarkdownLexer<'src> {
// Not a thematic break - restore position and consume as emphasis marker
self.position = start_position;

// In EmphasisInline context, always emit single tokens for * and _.
// This allows partial consumption of delimiter runs when the match algorithm
// determines only 1 char should be used from a 2-char run.
if matches!(context, MarkdownLexContext::EmphasisInline) {
self.advance(1);
return match start_char {
b'*' => STAR,
b'_' => UNDERSCORE,
b'-' => MINUS,
_ => unreachable!(),
};
}

// Check for double emphasis markers (**, __)
// Note: -- is not valid markdown emphasis, so we don't check for it
if start_char != b'-' && self.peek_byte() == Some(start_char) {
Expand Down Expand Up @@ -834,21 +877,18 @@ impl<'src> MarkdownLexer<'src> {
let start_position = self.position;
let mut eq_count = 0;

// Consume all `=` and spaces
loop {
match self.current_byte() {
Some(b'=') => {
self.advance(1);
eq_count += 1;
}
Some(b' ') => {
self.advance(1);
}
_ => break,
}
// Consume only `=` characters — no spaces between (CommonMark §4.3)
while let Some(b'=') = self.current_byte() {
self.advance(1);
eq_count += 1;
}

// Allow optional trailing whitespace only
while matches!(self.current_byte(), Some(b' ' | b'\t')) {
self.advance(1);
}

// Must have at least one `=` and be followed by newline or EOF
// Must have at least one `=` and nothing else before newline or EOF
if eq_count >= 1 && matches!(self.current_byte(), Some(b'\n' | b'\r') | None) {
return MD_SETEXT_UNDERLINE_LITERAL;
}
Expand Down Expand Up @@ -1200,6 +1240,7 @@ impl<'src> ReLexer<'src> for MarkdownLexer<'src> {
let lex_context = match context {
MarkdownReLexContext::Regular => MarkdownLexContext::Regular,
MarkdownReLexContext::LinkDefinition => MarkdownLexContext::LinkDefinition,
MarkdownReLexContext::EmphasisInline => MarkdownLexContext::EmphasisInline,
};

let re_lexed_kind = match self.current_byte() {
Expand Down
56 changes: 5 additions & 51 deletions crates/biome_markdown_parser/src/link_reference.rs
Original file line number Diff line number Diff line change
@@ -1,46 +1,13 @@
use std::collections::HashSet;

use biome_markdown_syntax::{MdLinkLabel, MdLinkReferenceDefinition};
use biome_rowan::{AstNode, Direction};
use biome_markdown_syntax::MdLinkReferenceDefinition;
use biome_rowan::AstNode;

use crate::MarkdownLosslessTreeSink;
use crate::MarkdownParseOptions;
use crate::parser::MarkdownParser;
use crate::syntax::parse_document;

/// Normalizes a link reference label for matching:
/// backslash "escapes" are resolved (the backslash is dropped and the next
/// character is kept), runs of whitespace collapse to a single space,
/// leading/trailing whitespace is stripped, and characters are lowercased
/// via `char::to_lowercase`.
///
/// NOTE(review): CommonMark specifies Unicode *case folding* for label
/// matching; `to_lowercase` approximates it but differs for some characters
/// — confirm against the spec tests if exact conformance matters.
/// NOTE(review): the backslash is dropped before *any* following character,
/// while CommonMark escapes only ASCII punctuation — confirm this is the
/// intended behavior for label matching.
pub(crate) fn normalize_reference_label(text: &str) -> String {
    let mut out = String::new();
    // `next()` is also called inside the loop body (to resolve escapes), so a
    // `while let` over the iterator is required rather than a `for` loop.
    // (The original `.peekable()` adaptor was unused — `peek()` was never
    // called — so it has been removed.)
    let mut chars = text.chars();
    let mut saw_whitespace = false;

    while let Some(c) = chars.next() {
        if c == '\\' {
            // Backslash: emit the following character literally (even if it
            // is whitespace), skipping the normal whitespace classification.
            // A trailing lone backslash is silently dropped.
            if let Some(next) = chars.next() {
                push_normalized_char(&mut out, next, &mut saw_whitespace);
            }
            continue;
        }

        if c.is_whitespace() {
            // Defer emitting the separator: a run of whitespace becomes one
            // space, and leading/trailing whitespace is dropped entirely.
            saw_whitespace = true;
            continue;
        }

        push_normalized_char(&mut out, c, &mut saw_whitespace);
    }

    out
}

/// Appends `c` (lowercased) to `out`, first emitting a single space if a run
/// of whitespace is pending and `out` already has content. The emptiness
/// check is what trims leading whitespace; the deferred flag is what
/// collapses interior runs.
fn push_normalized_char(out: &mut String, c: char, saw_whitespace: &mut bool) {
    if *saw_whitespace && !out.is_empty() {
        out.push(' ');
    }
    *saw_whitespace = false;
    // `to_lowercase` may yield more than one char (e.g. 'İ'); extend with all.
    out.extend(c.to_lowercase());
}
use crate::syntax::reference::normalize_reference_label;

pub(crate) fn collect_link_reference_definitions(
source: &str,
Expand All @@ -61,26 +28,13 @@ pub(crate) fn collect_link_reference_definitions(
if let Some(def) = MdLinkReferenceDefinition::cast(node)
&& let Ok(label) = def.label()
{
let raw = collect_label_text(label);
let raw = label.syntax().text_trimmed().to_string();
let normalized = normalize_reference_label(&raw);
if !normalized.is_empty() {
definitions.insert(normalized);
definitions.insert(normalized.into_owned());
}
}
}

definitions
}

/// Concatenates the text of every token under the label's content node,
/// walking descendants in document order, to recover the raw label text.
fn collect_label_text(label: MdLinkLabel) -> String {
    label
        .content()
        .syntax()
        .descendants_with_tokens(Direction::Next)
        .filter_map(|element| element.into_token())
        .fold(String::new(), |mut acc, token| {
            acc.push_str(token.text());
            acc
        })
}
Loading
Loading