-
Notifications
You must be signed in to change notification settings - Fork 332
Fix markdown character escaping bug (issue #1236) #1741
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -4464,9 +4464,32 @@ def frag() -> Fragment: | |
| font_glyphs = self.current_font.cmap # type: ignore[union-attr] | ||
| else: | ||
| font_glyphs = [] | ||
| num_escape_chars = 0 | ||
|
|
||
| while text: | ||
| tlt = text[:3] ## get triples to check for escape character | ||
| if self.MARKDOWN_ESCAPE_CHARACTER == tlt[0] and tlt[1:] in [ | ||
| "**", | ||
| "__", | ||
| "~~", | ||
| "--", | ||
| ]: | ||
| ## remove the escape character | ||
| txt_frag.append(text[1]) | ||
| txt_frag.append(text[2]) | ||
| yield frag() | ||
| text = text[3:] | ||
| continue | ||
|
|
||
| if self.MARKDOWN_ESCAPE_CHARACTER == tlt[0:1] and \ | ||
| self.MARKDOWN_ESCAPE_CHARACTER == tlt[1:2]: | ||
| # double-escape, juste produce it | ||
| txt_frag.append(text[0]) | ||
| txt_frag.append(text[1]) | ||
| yield frag() | ||
| text = text[2:] | ||
| continue | ||
|
|
||
|
|
||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You are removing characters from the input text even when |
||
| is_marker = text[:2] in ( | ||
| self.MARKDOWN_BOLD_MARKER, | ||
| self.MARKDOWN_ITALICS_MARKER, | ||
|
|
@@ -4509,29 +4532,19 @@ def frag() -> Fragment: | |
| and (not txt_frag or txt_frag[-1] != half_marker) | ||
| and (len(text) < 3 or text[2] != half_marker) | ||
| ): | ||
| txt_frag = ( | ||
| txt_frag[: -((num_escape_chars + 1) // 2)] | ||
| if num_escape_chars > 0 | ||
| else txt_frag | ||
| ) | ||
| if num_escape_chars % 2 == 0: | ||
| if txt_frag: | ||
| yield frag() | ||
| if text[:2] == self.MARKDOWN_BOLD_MARKER: | ||
| in_bold = not in_bold | ||
| if text[:2] == self.MARKDOWN_ITALICS_MARKER: | ||
| in_italics = not in_italics | ||
| if text[:2] == self.MARKDOWN_STRIKETHROUGH_MARKER: | ||
| in_strikethrough = not in_strikethrough | ||
| if text[:2] == self.MARKDOWN_UNDERLINE_MARKER: | ||
| in_underline = not in_underline | ||
| text = text[2:] | ||
| continue | ||
| num_escape_chars = ( | ||
| num_escape_chars + 1 | ||
| if text[0] == self.MARKDOWN_ESCAPE_CHARACTER | ||
| else 0 | ||
| ) | ||
| if txt_frag: | ||
| yield frag() | ||
| if text[:2] == self.MARKDOWN_BOLD_MARKER: | ||
| in_bold = not in_bold | ||
| if text[:2] == self.MARKDOWN_ITALICS_MARKER: | ||
| in_italics = not in_italics | ||
| if text[:2] == self.MARKDOWN_STRIKETHROUGH_MARKER: | ||
| in_strikethrough = not in_strikethrough | ||
| if text[:2] == self.MARKDOWN_UNDERLINE_MARKER: | ||
| in_underline = not in_underline | ||
| text = text[2:] | ||
| continue | ||
|
|
||
| is_link = self.MARKDOWN_LINK_REGEX.match(text) | ||
| if is_link: | ||
| link_text, link_dest, text = is_link.groups() | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -15,6 +15,40 @@ | |
| GSTATE_BI = GSTATE.copy() | ||
| GSTATE_BI.font_style = "BI" | ||
|
|
||
| def merge_fragments(fragments): | ||
| """ | ||
| Helper function for testing the escaping chracters | ||
| Will merge fragments that have different characters but same fragment.graphics_state | ||
| and same fragment.k and same fragment.link. | ||
| Example Input: | ||
| ( | ||
| Fragment(characters=['a'], graphics_state={000}, k=1, link=None), | ||
| Fragment(characters=['b'], graphics_state={000}, k=1, link=None) | ||
| ) | ||
| Example Output: | ||
| (Fragment(characters=['a', 'b'], graphics_state={000}, k=1, link=None)) | ||
| """ | ||
| if not fragments: | ||
| return [] | ||
|
|
||
| merged_fragments = [] | ||
| current_fragment = fragments[0] | ||
|
|
||
| for fragment in fragments[1:]: | ||
| if ( | ||
| fragment.graphics_state == current_fragment.graphics_state | ||
| and fragment.k == current_fragment.k | ||
| and fragment.link == current_fragment.link | ||
| ): | ||
| current_fragment.characters.extend(fragment.characters) | ||
| else: | ||
| merged_fragments.append(current_fragment) | ||
| current_fragment = fragment | ||
|
|
||
| merged_fragments.append(current_fragment) | ||
|
|
||
| return tuple(merged_fragments) | ||
|
|
||
|
|
||
| def test_markdown_parse_simple_ok(): | ||
| frags = tuple( | ||
|
|
@@ -35,63 +69,75 @@ def test_markdown_parse_simple_ok(): | |
|
|
||
|
|
||
| def test_markdown_parse_simple_ok_escaped(): | ||
| frags = tuple( | ||
| FPDF()._parse_chars( | ||
| "\\**bold\\**, \\__italics\\__ and \\--underlined\\-- escaped", True | ||
| frags = merge_fragments( | ||
| tuple( | ||
| FPDF()._parse_chars( | ||
| "\\**bold\\**, \\__italics\\__ and \\--underlined\\-- escaped", True) | ||
| ) | ||
| ) | ||
| expected = ( | ||
| Fragment("**bold**, __italics__ and --underlined-- escaped", GSTATE, k=PDF.k), | ||
| Fragment("**bold**, __italics__ and --underlined-- escaped", GSTATE, k=PDF.k), | ||
| ) | ||
| assert frags == expected | ||
| frags = tuple( | ||
| frags = merge_fragments (tuple( | ||
| FPDF()._parse_chars( | ||
| r"raw \**bold\**, \__italics\__ and \--underlined\-- escaped", True | ||
| ) | ||
| ) | ||
| )) | ||
| expected = ( | ||
| Fragment( | ||
| "raw **bold**, __italics__ and --underlined-- escaped", GSTATE, k=PDF.k | ||
| ), | ||
| ) | ||
| assert frags == expected | ||
| frags = tuple(FPDF()._parse_chars("escape *\\*between marker*\\*", True)) | ||
| frags =( | ||
| tuple(FPDF()._parse_chars("escape *\\*between marker*\\*", True)) | ||
| ) | ||
| expected = (Fragment("escape *\\*between marker*\\*", GSTATE, k=PDF.k),) | ||
| assert frags == expected | ||
| frags = tuple(FPDF()._parse_chars("escape **\\after marker**\\", True)) | ||
| frags = merge_fragments( | ||
| tuple(FPDF()._parse_chars("escape **\\after marker**\\", True)) | ||
| ) | ||
| expected = ( | ||
| Fragment("escape ", GSTATE, k=PDF.k), | ||
| Fragment("\\after marker", GSTATE_B, k=PDF.k), | ||
| Fragment("\\", GSTATE, k=PDF.k), | ||
| ) | ||
|
|
||
|
|
||
| def test_markdown_unrelated_escape(): | ||
| frags = tuple(FPDF()._parse_chars("unrelated \\ escape \\**bold\\**", True)) | ||
| frags = merge_fragments( | ||
| tuple(FPDF()._parse_chars("unrelated \\ escape \\**bold\\**", True)) | ||
| ) | ||
| expected = (Fragment("unrelated \\ escape **bold**", GSTATE, k=PDF.k),) | ||
| assert frags == expected | ||
| frags = tuple( | ||
| FPDF()._parse_chars("unrelated \\\\ double escape \\**bold\\**", True) | ||
| frags = merge_fragments( | ||
| tuple(FPDF()._parse_chars("unrelated \\\\ double escape \\**bold\\**", True)) | ||
| ) | ||
| expected = (Fragment("unrelated \\\\ double escape **bold**", GSTATE, k=PDF.k),) | ||
| assert frags == expected | ||
|
|
||
|
|
||
| def test_markdown_parse_multiple_escape(): | ||
| frags = tuple(FPDF()._parse_chars("\\\\**bold\\\\** double escaped", True)) | ||
| frags = merge_fragments( | ||
| tuple(FPDF()._parse_chars("\\\\**bold\\\\** double escaped", True)) | ||
| ) | ||
| expected = ( | ||
| Fragment("\\", GSTATE, k=PDF.k), | ||
| Fragment("bold\\", GSTATE_B, k=PDF.k), | ||
| Fragment("\\\\", GSTATE, k=PDF.k), | ||
| Fragment("bold\\\\", GSTATE_B, k=PDF.k), | ||
| Fragment(" double escaped", GSTATE, k=PDF.k), | ||
| ) | ||
|
|
||
| assert frags == expected | ||
| frags = tuple(FPDF()._parse_chars("\\\\\\**triple bold\\\\\\** escaped", True)) | ||
| expected = (Fragment("\\**triple bold\\** escaped", GSTATE, k=PDF.k),) | ||
| frags = merge_fragments( | ||
| tuple(FPDF()._parse_chars("\\\\\\**triple bold\\\\\\** escaped", True)) | ||
| ) | ||
| expected = (Fragment("\\\\**triple bold\\\\** escaped", GSTATE, k=PDF.k),) | ||
| assert frags == expected | ||
|
|
||
|
|
||
| def test_markdown_parse_overlapping(): | ||
| frags = tuple(FPDF()._parse_chars("**bold __italics__**", True)) | ||
| frags = merge_fragments( | ||
| tuple(FPDF()._parse_chars("**bold __italics__**", True)) | ||
| ) | ||
| expected = ( | ||
| Fragment("bold ", GSTATE_B, k=PDF.k), | ||
| Fragment("italics", GSTATE_BI, k=PDF.k), | ||
|
|
@@ -100,13 +146,14 @@ def test_markdown_parse_overlapping(): | |
|
|
||
|
|
||
| def test_markdown_parse_overlapping_escaped(): | ||
| frags = tuple(FPDF()._parse_chars("**bold \\__italics\\__**", True)) | ||
| frags = merge_fragments( | ||
| tuple(FPDF()._parse_chars("**bold \\__italics\\__**", True)) | ||
| ) | ||
| expected = (Fragment("bold __italics__", GSTATE_B, k=PDF.k),) | ||
| assert frags == expected | ||
|
|
||
|
|
||
| def test_markdown_parse_crossing_markers(): | ||
| frags = tuple(FPDF()._parse_chars("**bold __and** italics__", True)) | ||
| frags = merge_fragments (tuple(FPDF()._parse_chars("**bold __and** italics__", True))) | ||
| expected = ( | ||
| Fragment("bold ", GSTATE_B, k=PDF.k), | ||
| Fragment("and", GSTATE_BI, k=PDF.k), | ||
|
|
@@ -116,7 +163,7 @@ def test_markdown_parse_crossing_markers(): | |
|
|
||
|
|
||
| def test_markdown_parse_crossing_markers_escaped(): | ||
| frags = tuple(FPDF()._parse_chars("**bold __and\\** italics__", True)) | ||
| frags =merge_fragments (tuple(FPDF()._parse_chars("**bold __and\\** italics__", True))) | ||
| expected = ( | ||
| Fragment("bold ", GSTATE_B, k=PDF.k), | ||
| Fragment("and** italics", GSTATE_BI, k=PDF.k), | ||
|
|
@@ -125,7 +172,7 @@ def test_markdown_parse_crossing_markers_escaped(): | |
|
|
||
|
|
||
| def test_markdown_parse_unterminated(): | ||
| frags = tuple(FPDF()._parse_chars("**bold __italics__", True)) | ||
| frags = merge_fragments (tuple(FPDF()._parse_chars("**bold __italics__", True))) | ||
| expected = ( | ||
| Fragment("bold ", GSTATE_B, k=PDF.k), | ||
| Fragment("italics", GSTATE_BI, k=PDF.k), | ||
|
|
@@ -134,7 +181,7 @@ def test_markdown_parse_unterminated(): | |
|
|
||
|
|
||
| def test_markdown_parse_unterminated_escaped(): | ||
| frags = tuple(FPDF()._parse_chars("**bold\\** __italics__", True)) | ||
| frags = merge_fragments(tuple(FPDF()._parse_chars("**bold\\** __italics__", True))) | ||
| expected = ( | ||
| Fragment("bold** ", GSTATE_B, k=PDF.k), | ||
| Fragment("italics", GSTATE_BI, k=PDF.k), | ||
|
|
@@ -143,16 +190,22 @@ def test_markdown_parse_unterminated_escaped(): | |
|
|
||
|
|
||
| def test_markdown_parse_line_of_markers(): | ||
| frags = tuple(FPDF()._parse_chars("*** woops", True)) | ||
| frags = merge_fragments (tuple(FPDF()._parse_chars("*** woops", True))) | ||
| expected = (Fragment("*** woops", GSTATE, k=PDF.k),) | ||
| assert frags == expected | ||
| frags = tuple(FPDF()._parse_chars("----------", True)) | ||
| frags = merge_fragments (tuple(FPDF()._parse_chars("----------", True))) | ||
| expected = (Fragment("----------", GSTATE, k=PDF.k),) | ||
| assert frags == expected | ||
| frags = tuple(FPDF()._parse_chars("****BOLD**", True)) | ||
| expected = (Fragment("****BOLD", GSTATE, k=PDF.k),) | ||
| expected = (Fragment("**BOLD", GSTATE_B, k=PDF.k),) | ||
| frags = merge_fragments(tuple(FPDF()._parse_chars("\\****BOLD**\\**", True))) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I guess you missing an |
||
| expected = ( | ||
| Fragment("**", GSTATE, k=PDF.k), | ||
| Fragment("BOLD", GSTATE_B, k=PDF.k), | ||
| Fragment("**", GSTATE, k=PDF.k),) | ||
|
|
||
| assert frags == expected | ||
| frags = tuple(FPDF()._parse_chars("* **BOLD**", True)) | ||
| frags = merge_fragments (tuple(FPDF()._parse_chars("* **BOLD**", True))) | ||
| expected = ( | ||
| Fragment("* ", GSTATE, k=PDF.k), | ||
| Fragment("BOLD", GSTATE_B, k=PDF.k), | ||
|
|
@@ -161,17 +214,20 @@ def test_markdown_parse_line_of_markers(): | |
|
|
||
|
|
||
| def test_markdown_parse_line_of_markers_escaped(): | ||
| frags = tuple(FPDF()._parse_chars("\\****BOLD**", True)) | ||
| expected = (Fragment("\\****BOLD", GSTATE, k=PDF.k),) | ||
| frags = merge_fragments(tuple(FPDF()._parse_chars("\\****BOLD**", True))) | ||
| expected = ( | ||
| Fragment("**", GSTATE, k=PDF.k), | ||
| Fragment("BOLD", GSTATE_B, k=PDF.k), | ||
| ) | ||
| assert frags == expected | ||
| frags = tuple(FPDF()._parse_chars("*\\***BOLD**", True)) | ||
| expected = (Fragment("*\\***BOLD", GSTATE, k=PDF.k),) | ||
| frags = merge_fragments(tuple(FPDF()._parse_chars("*\\***BOLD**", True))) | ||
| expected = (Fragment("****BOLD", GSTATE, k=PDF.k),) | ||
| assert frags == expected | ||
|
|
||
|
|
||
| def test_markdown_parse_newline_after_markdown_link(): # issue 916 | ||
| text = "[fpdf2](https://py-pdf.github.io/fpdf2/)\nGo visit it!" | ||
| frags = tuple(FPDF()._parse_chars(text, True)) | ||
| frags = merge_fragments(tuple(FPDF()._parse_chars(text, True))) | ||
| gstate_link = GSTATE.copy() | ||
| gstate_link.underline = True | ||
| expected = ( | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This change would make it easier to read - less 'magic values' across the code.