diff --git a/package.json b/package.json
index 332a087ac..2275dd3a8 100644
--- a/package.json
+++ b/package.json
@@ -7,6 +7,7 @@
     "dompurify": "^3.3.1",
     "express": "^5.2.1",
     "isomorphic-dompurify": "^2.35.0",
-    "rss-parser": "^3.13.0"
+    "rss-parser": "^3.13.0",
+    "sanitize-html": "^2.17.0"
   }
 }
diff --git a/src/aggregator.js b/src/aggregator.js
index 7a1625e1d..3197bb703 100644
--- a/src/aggregator.js
+++ b/src/aggregator.js
@@ -2,6 +2,7 @@
 const Parser = require('rss-parser');
 const axios = require('axios');
 const { Octokit } = require('@octokit/rest');
+const sanitizeHtml = require('sanitize-html');
 
 const parser = new Parser({
   timeout: 10000,
@@ -28,6 +29,17 @@ function addUTMParams(url, category = 'general') {
   return url.includes('?') ? `${url}&${utmParams}` : `${url}?${utmParams}`;
 }
 
+// Robust HTML sanitization: strip all tags and unsafe content
+function sanitizeText(input) {
+  if (!input) {
+    return '';
+  }
+  return sanitizeHtml(input, {
+    allowedTags: [],
+    allowedAttributes: {},
+  });
+}
+
 /**
  * Smart truncate: cut at last punctuation before limit
  * Avoids cutting words in the middle
@@ -65,10 +77,10 @@ function smartTruncate(text, maxLength = 500) {
 
 // Sanitize and process articles
 function sanitizeArticle(article, sourceName, tags, category) {
-  const rawSummary = article.contentSnippet?.replace(/<[^>]*>/g, '') || '';
+  const rawSummary = sanitizeText(article.contentSnippet) || '';
 
   return {
-    title: article.title?.replace(/<[^>]*>/g, '').slice(0, 200) || 'Untitled',
+    title: (sanitizeText(article.title) || 'Untitled').slice(0, 200),
     link: addUTMParams(article.link, category), // UTM tracks traffic FROM AI-Pulse
     pubDate: new Date(article.pubDate || Date.now()),
     source: sourceName,