Skip to content

Commit 913f480

Browse files
stenhougaard and claude committed
Add attribute extraction support with @Attribute syntax
Implemented attribute extraction for CSS selectors using @ syntax.

Features:
- Syntax: "selector@attribute" (e.g., "a@href", "img@src", "img@alt")
- Works in both extractText() and extractHtml() functions
- Supports any HTML attribute (href, src, alt, title, data-*, etc.)
- Returns attribute values instead of text content or HTML
- Maintains single value vs array logic based on element count

Examples:
- "a@href" → Extract link URLs
- "img@src" → Extract image sources
- "img@alt" → Extract alt text
- ".article-headline a@href" → Extract links from specific articles

This enables the naming convention system in bookmarklets:
{ "headlines": ".article-headline", "headlines_link": ".article-headline a@href", "thumbnails_src": "img.thumbnail@src" }

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
1 parent 48464d5 commit 913f480

File tree

1 file changed

+35
-3
lines changed

1 file changed

+35
-3
lines changed

src/utils/parser.ts

Lines changed: 35 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,34 +14,66 @@ export function parseHtml(html: string): Document {
1414
return doc;
1515
}
1616

17+
/**
 * Split an extraction selector of the form "cssSelector@attribute" into its
 * CSS part and the attribute name.
 *
 * The attribute name may contain word characters, hyphens, colons and dots, so
 * names such as "data-id", "aria-label" and "xlink:href" are recognised in
 * addition to plain names like "href" or "src". Surrounding whitespace is
 * ignored when looking for the "@attribute" suffix.
 *
 * @param selector - A CSS selector, optionally followed by "@attribute".
 * @returns The CSS selector (trimmed) plus `attribute` when the suffix is
 *          present; otherwise the input selector unchanged and no `attribute`.
 */
function parseSelector(selector: string): { selector: string; attribute?: string } {
  // Greedy `.+` makes the LAST "@" the separator, so an "@" inside a quoted
  // attribute-selector value (e.g. a[href*="@example.com"]) is left alone:
  // such selectors end in `]`, which is not a valid attribute-name character.
  const match = /^(.+)@([\w:][\w:.-]*)$/.exec(selector.trim());
  if (match === null) {
    return { selector };
  }

  const [, cssPart = "", attribute = ""] = match;
  return {
    selector: cssPart.trim(),
    attribute,
  };
}
30+
1731
/**
 * Collect the text of every element matched by a CSS selector.
 *
 * The selector may carry an "@attribute" suffix (e.g., "a@href", "img@src");
 * when it does, the value of that attribute is collected instead of the
 * element's trimmed text content.
 *
 * @returns An empty string when nothing matches, a single string when exactly
 *          one element matches, otherwise an array with one entry per element.
 */
export function extractText(doc: Document, selector: string): string | string[] {
  const parsed = parseSelector(selector);
  const attributeName = parsed.attribute;
  const matched = Array.from(doc.querySelectorAll(parsed.selector));

  if (matched.length === 0) {
    return "";
  }

  const values: string[] = [];
  for (const node of matched) {
    if (attributeName) {
      // Attribute mode: a missing attribute yields an empty string.
      values.push((node as Element).getAttribute(attributeName) || "");
    } else {
      // Text mode: trimmed text content, empty string when absent.
      values.push(node.textContent?.trim() || "");
    }
  }

  if (values.length === 1) {
    return values[0];
  }
  return values;
}
3256

3357
/**
3458
* Extract HTML content (outerHTML) from elements matching a CSS selector
59+
* Supports attribute extraction with @attribute syntax (e.g., "a@href", "img@src")
3560
* Returns single string if one element, array if multiple, empty string if none
3661
*/
3762
export function extractHtml(doc: Document, selector: string): string | string[] {
38-
const elements = doc.querySelectorAll(selector);
63+
const { selector: cssSelector, attribute } = parseSelector(selector);
64+
65+
const elements = doc.querySelectorAll(cssSelector);
3966

4067
if (elements.length === 0) {
4168
return "";
4269
}
4370

4471
const htmls = Array.from(elements).map((el) => {
72+
if (attribute) {
73+
// Extract attribute value
74+
return (el as Element).getAttribute(attribute) || "";
75+
}
76+
// Extract outerHTML
4577
// deno-dom Element has outerHTML property
4678
return (el as Element & { outerHTML: string }).outerHTML || "";
4779
});

0 commit comments

Comments
 (0)