Add attribute extraction support with @Attribute syntax

stenhougaard · claude · stenhougaard · commit 913f4800cfda · 2025-11-16T14:49:38.000+01:00
Implemented attribute extraction for CSS selectors using @ syntax:

Features:
- Syntax: "selector@attribute" (e.g., "a@href", "img@src", "img@alt")
- Works in both extractText() and extractHtml() functions
- Supports any HTML attribute (href, src, alt, title, data-*, etc.)
- Returns attribute values instead of text content or HTML
- Maintains single value vs array logic based on element count

Examples:
- "a@href" → Extract link URLs
- "img@src" → Extract image sources
- "img@alt" → Extract alt text
- ".article-headline a@href" → Extract links from specific articles

This enables the naming convention system in bookmarklets:

{
  "headlines": ".article-headline",
  "headlines_link": ".article-headline a@href",
  "thumbnails_src": "img.thumbnail@src"
}

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
diff --git a/src/utils/parser.ts b/src/utils/parser.ts
@@ -14,34 +14,66 @@ export function parseHtml(html: string): Document {
   return doc;
 }
 
+/**
+ * Split a "selector@attribute" spec (e.g. "a@href", "img@data-src") into its CSS
+ * selector and attribute name. The split happens at the LAST "@"; the name may use
+ * word characters, "-" and ":" (data-*, aria-*, xlink:href). Input without a
+ * trailing "@name" is returned unchanged as the selector, with no attribute.
+ */
+function parseSelector(selector: string): { selector: string; attribute?: string } {
+  const match = selector.match(/^(.+)@([\w:-]+)$/);
+  if (match) {
+    return { selector: match[1].trim(), attribute: match[2] };
+  }
+  return { selector };
+}
+
 /**
  * Extract text content from elements matching a CSS selector
+ * A trailing "@attribute" on the selector (e.g., "a@href", "img@src") extracts that attribute's value instead
  * Returns single string if one element, array if multiple, empty string if none
  */
 export function extractText(doc: Document, selector: string): string | string[] {
-  const elements = doc.querySelectorAll(selector);
+  const { selector: cssSelector, attribute } = parseSelector(selector);
+
+  const elements = doc.querySelectorAll(cssSelector);
 
   if (elements.length === 0) {
     return "";
   }
 
-  const texts = Array.from(elements).map((el) => el.textContent?.trim() || "");
+  const texts = Array.from(elements).map((el) => {
+    if (attribute) {
+      // Attribute mode: raw (untrimmed) attribute value; "" when the attribute is absent or empty
+      return (el as Element).getAttribute(attribute) || "";
+    }
+    // Text mode: trimmed text content; "" when it is null or empty
+    return el.textContent?.trim() || "";
+  });
 
   return texts.length === 1 ? texts[0] : texts;
 }
 
 /**
  * Extract HTML content (outerHTML) from elements matching a CSS selector
+ * Supports attribute extraction with @attribute syntax (e.g., "a@href", "img@src")
  * Returns single string if one element, array if multiple, empty string if none
  */
 export function extractHtml(doc: Document, selector: string): string | string[] {
-  const elements = doc.querySelectorAll(selector);
+  const { selector: cssSelector, attribute } = parseSelector(selector);
+
+  const elements = doc.querySelectorAll(cssSelector);
 
   if (elements.length === 0) {
     return "";
   }
 
   const htmls = Array.from(elements).map((el) => {
+    if (attribute) {
+      // Extract attribute value
+      return (el as Element).getAttribute(attribute) || "";
+    }
+    // Extract outerHTML
     // deno-dom Element has outerHTML property
     return (el as Element & { outerHTML: string }).outerHTML || "";
   });