Handle footnotes and automatic URL linking

georgelemon · georgelemon · commit ca249d20ad1a · 2025-11-02T23:13:18.000+02:00
Signed-off-by: George Lemon <georgelemon@protonmail.com>
diff --git a/src/marvdown/ast.nim b/src/marvdown/ast.nim
@@ -27,6 +27,8 @@ type
     mdkHtml,           # Raw HTML content
     mdkTable,          # Table
     mdkParagraph,      # Paragraph
+    mdkFootnoteDef,    # Footnote definition
+    mdkFootnoteRef,    # Footnote reference
     mdkDocument,       # Root document node
     mdkUnknown         # Unknown or unsupported node
 
@@ -82,6 +84,12 @@ type
         ## Table rows
     of mdkUnknown:
       info*: string # For unknown or unsupported nodes
+    of mdkFootnoteRef:
+      footnoteRefId*: string
+        ## Identifier for the footnote reference
+    of mdkFootnoteDef:
+      footnoteId*: string
+        ## Identifier for the footnote definition
     else: discard
     children*: MarkdownNodeList
       ## Child nodes (for container nodes)
diff --git a/src/marvdown/lexer.nim b/src/marvdown/lexer.nim
@@ -27,6 +27,8 @@ type
     mtkHtml,           # Raw HTML content
     mtkTable,          # Table
     mtkParagraph,      # Paragraph
+    mtkFootnoteRef,    # Footnote reference
+    mtkFootnoteDef,    # Footnote definition
     mtkDocument,       # Root document node
     mtkUnknown         # Unknown or unsupported token
     mtkEOF             # End of file/input
@@ -45,6 +47,7 @@ type
     current*: char
     pos*, line*, col*: int
     strbuf*: string
+    pendingTokens: seq[MarkdownTokenTuple] # Buffer for tokens split from text
 
 #
 # Markdown Lexer
@@ -88,6 +91,37 @@ proc initToken(lex: var MarkdownLexer, kind: MarkdownTokenKind, value: sink stri
 proc newTokenTuple(lex: MarkdownLexer, kind: MarkdownTokenKind, token: string = "", wsno: int = 0, attrs: Option[seq[string]] = none(seq[string])): MarkdownTokenTuple =
   (kind, token, lex.line, lex.col - token.len, lex.pos, wsno, attrs)
 
+proc handleAutoLink(lex: var MarkdownLexer, wsno: int): MarkdownTokenTuple =
+  ## Read a bare URL up to whitespace/EOF; returns mtkLink with attrs @[url, url]
+  var url = ""
+  while lex.current notin {' ', '\t', '\n', '\r', '\0'}:
+    url.add(lex.current)
+    lex.advance()
+  return newTokenTuple(lex, mtkLink, wsno=wsno, attrs=some(@[url, url]))
+
+proc scanTextWithLinks(lex: var MarkdownLexer, wsno: int): seq[MarkdownTokenTuple] =
+  ## Lex a run of plain text, emitting mtkText pieces and an mtkLink for each
+  ## http:// or https:// URL met before a line end or inline delimiter.
+  const stops = {'\n', '\r', '\0', '*', '_', '[', ']', '!', '`', '<'}
+  var text = "" # plain text gathered since the last emitted token
+  while lex.current notin stops:
+    var urlAhead = false
+    if lex.current == 'h' and lex.peek() == 't' and lex.peek(2) == 't' and lex.peek(3) == 'p':
+      let plain = lex.peek(4) == ':' and lex.peek(5) == '/' and lex.peek(6) == '/'
+      let secure = lex.peek(4) == 's' and lex.peek(5) == ':' and lex.peek(6) == '/' and lex.peek(7) == '/'
+      urlAhead = plain or secure
+    if urlAhead:
+      # Emit the text collected so far, then the link itself.
+      if text.len != 0:
+        result.add(newTokenTuple(lex, mtkText, text, wsno=wsno))
+        text.setLen(0)
+      result.add(lex.handleAutoLink(wsno))
+    else:
+      text.add(lex.current)
+      lex.advance()
+  if text.len != 0:
+    result.add(newTokenTuple(lex, mtkText, text, wsno=wsno))
+
 proc nextToken*(lex: var MarkdownLexer): MarkdownTokenTuple =
   ## Lex the next token from the input
   var wsno = 0
@@ -117,6 +151,12 @@ proc nextToken*(lex: var MarkdownLexer): MarkdownTokenTuple =
 
   # let startCol = wsno # not needed anymore
 
+  # Return buffered tokens if present
+  if lex.pendingTokens.len > 0:
+    let tok = lex.pendingTokens[0]
+    lex.pendingTokens = lex.pendingTokens[1..^1]
+    return tok
+
   case lex.current
   of '#':
     # Headings (e.g., ## Heading 2)
@@ -179,6 +219,7 @@ proc nextToken*(lex: var MarkdownLexer): MarkdownTokenTuple =
         lex.advance(); lex.advance() # skip both delimiters
         return newTokenTuple(lex, mtkStrong, wsno=wsno)
       else:
+        lex.advance();
         return newTokenTuple(lex, mtkEmphasis, wsno=wsno)
     else:
       return newTokenTuple(lex, mtkText, repeat(ch, count), wsno=wsno)
@@ -294,7 +335,34 @@ proc nextToken*(lex: var MarkdownLexer): MarkdownTokenTuple =
       lex.advance()
       return newTokenTuple(lex, mtkText, text, wsno=wsno)
   of '[':
-    # Link or Checkbox
+    # Link, Checkbox, or Footnote
+    if lex.peek() == '^':
+      # Footnote reference or definition
+      lex.advance() # skip '['
+      lex.advance() # skip '^'
+      lex.strbuf.setLen(0)
+      while lex.current != ']' and lex.current != '\0':
+        lex.strbuf.add(lex.current)
+        lex.advance()
+      let footId = lex.strbuf
+      if lex.current == ']':
+        lex.advance()
+        if lex.current == ':' and (lex.peek() == ' ' or lex.peek() == '\t'):
+          # Footnote definition: [^id]: text
+          lex.advance() # skip ':'
+          while lex.current == ' ' or lex.current == '\t':
+            lex.advance()
+          lex.strbuf.setLen(0)
+          while lex.current notin {'\n', '\r', '\0'}:
+            lex.strbuf.add(lex.current)
+            lex.advance()
+          return newTokenTuple(lex, mtkFootnoteDef,
+                    lex.strbuf.strip(), wsno=wsno, attrs=some(@[footId]))
+        else:
+          # Footnote reference: [^id]
+          return newTokenTuple(lex, mtkFootnoteRef, "",
+                    wsno=wsno, attrs=some(@[footId]))
+    # Regular link or checkbox
     lex.advance()
     lex.strbuf.setLen(0)
     while lex.current != ']' and lex.current != '\0':
@@ -348,6 +416,7 @@ proc nextToken*(lex: var MarkdownLexer): MarkdownTokenTuple =
       lex.advance(); lex.advance()
       return newTokenTuple(lex, mtkStrong, wsno=wsno)
     else:
+      lex.advance();
       return newTokenTuple(lex, mtkEmphasis, wsno=wsno)
   of ' ':
     # Line break (two or more spaces at end of line)
@@ -390,11 +459,10 @@ proc nextToken*(lex: var MarkdownLexer): MarkdownTokenTuple =
     return newTokenTuple(lex, mtkTable, lex.strbuf, wsno=wsno)
   else:
     # Paragraph or plain text
-    lex.strbuf.setLen(0)
-    # Stop at markdown delimiters
-    while lex.current notin {'\n', '\r', '\0', '*', '_', '[', ']', '!', '`', '<'}:
-      lex.strbuf.add(lex.current)
-      lex.advance()
-    if lex.strbuf.len > 0:
-      return newTokenTuple(lex, mtkText, lex.strbuf, wsno=wsno)
+    # Scan for auto links anywhere in the text
+    let tokens = lex.scanTextWithLinks(wsno)
+    if tokens.len > 0:
+      if tokens.len > 1:
+        lex.pendingTokens = tokens[1..^1]
+      return tokens[0]
     return newTokenTuple(lex, mtkUnknown, wsno=wsno)
diff --git a/src/marvdown/parser.nim b/src/marvdown/parser.nim
@@ -31,6 +31,10 @@ type
       ## Internal: Counter for generating unique selectors
     ast*: seq[MarkdownNode]
       ## The abstract syntax tree (AST) of the parsed markdown document
+    footnotes: OrderedTableRef[string, MarkdownNode]
+      ## Footnote definitions parsed from the document
+    footnotesHtml*: string
+      ## Generated HTML for footnotes at the end of the document
 
   TagType* = enum
     tagNone,       # No tags allowed
@@ -54,6 +58,10 @@ type
       ## For allowing use of `style` attribute, enable `allowInlineStyle`.
     enableAnchors*: bool
       ## Enable anchor generation in title blocks (enabled by default)
+    anchorIcon*: string = "🔗"
+      ## Icon used for anchor links in headings
+    showFootnotes*: bool = true
+      ## Insert footnotes HTML at the end of the document (default: true)
 
 #
 # forward declarations
@@ -503,6 +511,26 @@ let defaultOptions = MarkdownOptions(
   enableAnchors: true
 )
 
+proc parseFootnoteDef(md: var Markdown): MarkdownNode =
+  ## Build an mdkFootnoteDef node from the current token and register it in
+  ## `md.footnotes` under its identifier (the token's first attribute).
+  let tok = md.parser.curr
+  let fnId = tok.attrs.get()[0]
+  result = MarkdownNode(
+    kind: mdkFootnoteDef,
+    footnoteId: fnId,
+    children: MarkdownNodeList(),
+    line: tok.line, wsno: tok.wsno
+  )
+  # The token text is the footnote body; parse it as inline markdown.
+  for inlineNode in md.parseInline(tok.token.strip()):
+    result.children.items.add(inlineNode)
+  # Create the table lazily on first use; a later definition with the same id
+  # replaces the earlier one.
+  if md.footnotes == nil:
+    md.footnotes = newOrderedTable[string, MarkdownNode]()
+  md.footnotes[fnId] = result
+
 proc parseMarkdown(md: var Markdown, currentParagraph: var MarkdownNode) =
   while md.parser.curr.kind != mtkEOF:
     let curr = md.parser.curr
@@ -619,6 +647,22 @@ proc parseMarkdown(md: var Markdown, currentParagraph: var MarkdownNode) =
       closeCurrentParagraph()
       let bqNode = md.parseBlockquote()
       md.ast.add(bqNode)
+    of mtkFootnoteRef:
+      withCurrentParagraph do:
+        let id = curr.attrs.get()[0]
+        let fnNode = MarkdownNode(
+          kind: mdkFootnoteRef,
+          footnoteRefId: id,
+          line: curr.line,
+          wsno: curr.wsno
+        )
+        currentParagraph.children.items.add(fnNode)
+      md.advance()
+    of mtkFootnoteDef:
+      closeCurrentParagraph() # close any open paragraph
+      let node = md.parseFootnoteDef()
+      md.ast.add(node)
+      md.advance()
     else:
       closeCurrentParagraph()
       md.advance()
@@ -645,14 +689,24 @@ proc toHtml*(md: var Markdown): string =
   ## Convert the parsed Markdown AST to HTML
   for node in md.ast:
     add result, md.renderNode(node)
+  if md.opts.showFootnotes and md.footnotesHtml.len > 0:
+    add result, "<hr><div class=\"footnotes\">" & md.footnotesHtml & "</div>"
 
 proc getSelectors*(md: Markdown): OrderedTableRef[string, string] =
   ## Get the headline selectors (anchors) from the parsed Markdown
   md.selectors
 
 proc hasSelectors*(md: Markdown): bool =
   ## Check if there are any headline selectors (anchors) in the parsed Markdown
-  md.selectors.len > 0
+  md.selectors != nil and md.selectors.len > 0
+
+proc getFootnotes*(md: Markdown): OrderedTableRef[string, MarkdownNode] =
+  ## Return the table of footnote definitions stored on `md` (may be nil)
+  result = md.footnotes
+
+proc hasFootnotes*(md: Markdown): bool =
+  ## True when the footnote table exists and holds at least one definition
+  not isNil(md.footnotes) and md.footnotes.len != 0
 
 proc getTitle*(md: Markdown): string =
   ## Retrieve the first heading as the document title
@@ -685,7 +739,11 @@ proc renderNode(md: var Markdown, node: MarkdownNode): string =
     var linkContent = ""
     for child in node.children.items:
       linkContent.add(md.renderNode(child))
-    result = a(href=node.linkHref, title=node.linkTitle, linkContent)
+    result =
+      if node.linkTitle.len > 0:
+        a(href=node.linkHref, title=node.linkTitle, linkContent)
+      else:
+        a(href=node.linkHref, linkContent)
   of mdkImage:
     result = img(src=node.imageSrc, alt=node.imageAlt, title=node.imageTitle)
   of mdkList:
@@ -719,7 +777,9 @@ proc renderNode(md: var Markdown, node: MarkdownNode): string =
       else: # first occurrence
         md.selectorCounter[anchor] = 1
         md.selectors[anchor] = anchor
-      let anchorlink = a(href="#" & anchor, `class`="anchor-link", "🔗")
+      let anchorlink =
+            a(href="#" & anchor, `class`="anchor-link",
+                    md.opts.anchorIcon)
       add result,
         case node.level
         of 1: h1(id=anchor, anchorlink, innerContent)
@@ -768,5 +828,15 @@ proc renderNode(md: var Markdown, node: MarkdownNode): string =
     for child in node.children.items:
       bqContent.add(md.renderNode(child))
     result = "<blockquote>" & bqContent & "</blockquote>"
+  of mdkFootnoteRef:
+    # Footnote reference rendering
+    result = "<sup class=\"footnote-ref\"><a href=\"#fn-" & node.footnoteRefId & "\">" & node.footnoteRefId & "</a></sup>"
+  of mdkFootnoteDef:
+    # Footnote definition rendering (could be customized)
+    var fnContent = ""
+    for child in node.children.items:
+      fnContent.add(md.renderNode(child))
+    md.footnotesHtml.add("<div class=\"footnote\" id=\"fn-" & node.footnoteId & "\">" &
+             "<sup>" & node.footnoteId & "</sup> " & fnContent & "</div>")
   else:
     discard