From 15917de8faad9e50c97c5dfa463bb6678d4fc0b0 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 24 Feb 2026 15:09:35 +0000 Subject: [PATCH 1/4] Initial plan From 187820d70145b53445e313c47b52d7b28cfe9abb Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 24 Feb 2026 15:13:55 +0000 Subject: [PATCH 2/4] Remove Leon's scraper - outside Milwaukee area and not working well Co-authored-by: jjlauterbach <1447549+jjlauterbach@users.noreply.github.com> --- app/locations.yaml | 11 - app/scrapers/leons.py | 413 -------------------- scripts/generate_flavors.py | 2 - static/index.html | 4 - tests/test_leons_scraper.py | 733 ------------------------------------ 5 files changed, 1163 deletions(-) delete mode 100644 app/scrapers/leons.py delete mode 100644 tests/test_leons_scraper.py diff --git a/app/locations.yaml b/app/locations.yaml index bf21ec4..f9bcc9c 100644 --- a/app/locations.yaml +++ b/app/locations.yaml @@ -87,17 +87,6 @@ gilles: url: "https://gillesfrozencustard.com/flavor-of-the-day" enabled: true -leons: - - id: leons-oshkosh - name: "Leon's Frozen Custard" - brand: Leons - address: "121 W Murdock Ave, Oshkosh, WI, United States, 54901" - lat: 44.080552821673784 - lng: -88.53003838993757 - url: "https://www.facebook.com/LeonsCustard/" - facebook: "https://www.facebook.com/LeonsCustard/" - enabled: true - bigdeal: - id: bigdeal-main name: "Big Deal Burgers" diff --git a/app/scrapers/leons.py b/app/scrapers/leons.py deleted file mode 100644 index 8a9ba1c..0000000 --- a/app/scrapers/leons.py +++ /dev/null @@ -1,413 +0,0 @@ -"""Scraper for Leon's Frozen Custard using Playwright to scrape Facebook.""" - -import html -import re -import time - -from playwright.sync_api import Error as PlaywrightError -from playwright.sync_api import TimeoutError as PlaywrightTimeoutError -from playwright.sync_api import sync_playwright - -from app.scrapers.scraper_base import USER_AGENT, BaseScraper -from app.scrapers.utils import is_facebook_post_from_today - - -class LeonsScraper(BaseScraper): - """Scraper for Leon's Frozen Custard Facebook page.""" - - # Facebook page timeouts - configured for slow-loading pages with anti-bot measures - NAVIGATION_TIMEOUT = 60000 # 60 seconds for page navigation - SELECTOR_TIMEOUT = 30000 # 30 seconds for selector wait - MAX_RETRIES = 3 # Number of retry attempts for transient Playwright errors (including timeouts) - RETRY_BASE_DELAY = ( - 2 # Base delay multiplied by 2^attempt (produces 2s, 4s delays for 3 total attempts) - ) - - def __init__(self): - super().__init__("leons") - - def scrape(self): - """Scrape Leon's Facebook page for today's flavor.""" - self.log_start() - - if not self.locations: - self.log_error("No locations found") - return [] - - location = self.locations[0] - location_name = location.get("name", "Leon's Frozen Custard") - facebook_url = location.get("facebook") - - if not facebook_url: - self.log_error("No Facebook URL found in location config") - return [] - - try: - self.log_location(location_name, facebook_url) - flavor_text = self._scrape_facebook_page(facebook_url) - - if not flavor_text: - self.logger.warning("⚠️ LEONS: No flavor post found on Facebook") - return [] - - # Parse the flavor name from the post - flavor_name = self._extract_flavor_name(flavor_text) - - if not flavor_name: - self.logger.warning(f"⚠️ LEONS: Could not parse flavor from: {flavor_text[:100]}") - return [] - - self.log_flavor(location_name, flavor_name) - - flavor_entry = self.create_flavor( - location_name=location_name, - flavor=flavor_name, - description=None, - url=location.get("url"), - location_id=location.get("id"), - lat=location.get("lat"), - lng=location.get("lng"), - address=location.get("address"), - ) - - self.log_complete(1) - return [flavor_entry] - - except Exception as e: - self.log_error(f"Error scraping Leon's: {e}", exc_info=True) - return [] - - def _scrape_facebook_page(self, url): - """ - Use Playwright to scrape Leon's Facebook page with retry logic. - - Args: - url: Facebook page URL - - Returns: - str: Text content of the most recent flavor post, or None if not found - """ - for attempt in range(self.MAX_RETRIES): - try: - return self._scrape_facebook_page_attempt(url, attempt) - except PlaywrightTimeoutError as e: - # Timeout errors are transient and should be retried - if not self._handle_retry(attempt, f"Timeout: {e}"): - return None - except PlaywrightError as e: - # Other Playwright errors (network, page crash, etc.) may be transient - if not self._handle_retry(attempt, f"Playwright error: {e}"): - return None - except Exception as e: - # Unexpected errors should not be retried - self.logger.error(f"Unexpected error on attempt {attempt + 1}: {e}", exc_info=True) - return None - return None - - def _handle_retry(self, attempt, error_message): - """ - Handle retry logic with exponential backoff. - - Args: - attempt: Current attempt number (0-indexed) - error_message: Error message to log - - Returns: - bool: True if should retry, False if max retries reached - """ - if attempt < self.MAX_RETRIES - 1: - delay = self.RETRY_BASE_DELAY * (2**attempt) - self.logger.warning( - f"{error_message} on attempt {attempt + 1}/{self.MAX_RETRIES}. " - f"Retrying in {delay}s..." - ) - time.sleep(delay) - return True - - self.logger.error(f"{error_message} after {self.MAX_RETRIES} attempts") - return False - - def _scrape_facebook_page_attempt(self, url, attempt): - """ - Single attempt to scrape Facebook page. - - Args: - url: Facebook page URL - attempt: Current attempt number (0-indexed) - - Returns: - str: Text content of the most recent flavor post, or None if not found - - Raises: - PlaywrightTimeoutError: If page load or selector wait times out - Exception: For other errors - """ - with sync_playwright() as p: - browser = None - try: - # Launch browser in headless mode - browser = p.chromium.launch(headless=True) - context = browser.new_context(user_agent=USER_AGENT) - page = context.new_page() - - # Navigate to Facebook page with extended timeout - self.logger.debug( - f"Loading Facebook page (attempt {attempt + 1}): {url} " - f"(timeout: {self.NAVIGATION_TIMEOUT}ms)" - ) - page.goto(url, wait_until="networkidle", timeout=self.NAVIGATION_TIMEOUT) - - # Wait for posts to load with extended timeout - self.logger.debug(f"Waiting for posts to load (timeout: {self.SELECTOR_TIMEOUT}ms)") - page.wait_for_selector('[role="article"]', timeout=self.SELECTOR_TIMEOUT) - - # Scroll down to load more posts - try: - page.evaluate("window.scrollTo(0, document.body.scrollHeight / 2)") - page.wait_for_timeout(1000) # Let posts load - except Exception: - pass - - # Get all post articles first - all_articles = page.query_selector_all('[role="article"]') - self.logger.debug(f"Found {len(all_articles)} total articles (including comments)") - - # Filter out nested articles (comments) - only keep top-level posts - # Comments are article elements nested within post article elements - top_level_articles = [] - for article in all_articles: - # Check if this article is nested within another article - # by looking for a parent with role="article" - try: - parent_article = article.evaluate( - """(element) => { - let parent = element.parentElement; - while (parent) { - if (parent.getAttribute('role') === 'article' && parent !== element) { - return true; - } - parent = parent.parentElement; - } - return false; - }""" - ) - if not parent_article: - top_level_articles.append(article) - except Exception: - # If evaluation fails, include the article to be safe - top_level_articles.append(article) - - self.logger.debug( - f"Filtered to {len(top_level_articles)} top-level posts (excluding comments)" - ) - - # Now expand "See more" links ONLY in top-level posts to reveal full content - expanded_count = 0 - for idx, article in enumerate(top_level_articles[:10]): # Process first 10 posts - try: - # Look for "See more" button within this specific article - # Try multiple selectors as Facebook structure can vary - see_more = None - selectors = [ - 'div[role="button"]:has-text("See more")', - '[role="button"]:has-text("See more")', - 'text="See more"', - ] - - for selector in selectors: - try: - see_more = article.query_selector(selector) - if see_more: - break - except Exception: - continue - - if see_more and see_more.is_visible(): - self.logger.debug(f"Expanding 'See more' in post {idx}") - see_more.click() - page.wait_for_timeout(500) # Wait for expansion - expanded_count += 1 - else: - self.logger.debug(f"No 'See more' button found in post {idx}") - except Exception as e: - self.logger.debug(f"Could not expand 'See more' in article {idx}: {e}") - pass # Continue if expansion fails - - self.logger.debug( - f"Expanded {expanded_count} 'See more' buttons in top-level posts" - ) - - # Wait a bit longer after all expansions to let content fully render - if expanded_count > 0: - page.wait_for_timeout(1000) - - # Look through recent posts for flavor information - for i, article in enumerate(top_level_articles[:10]): # Check first 10 posts - # Fetch inner text once for both date validation and content processing - try: - text_content = article.inner_text() - if not text_content or text_content.strip() == "": - self.logger.debug(f"Post {i}: Empty content, skipping") - continue - except Exception as e: - self.logger.debug(f"Post {i}: Error getting text content: {e}") - continue - - # Check if post is from today using pre-fetched text - if not is_facebook_post_from_today( - article, self.logger, article_text=text_content - ): - self.logger.debug(f"Post {i} is not from today, skipping") - continue - - text_lower = text_content.lower() - - # Log a preview of the post content - preview = text_content[:100].replace("\n", " ") - self.logger.debug(f"Post {i}: {preview}...") - - # For debugging, log if the text seems unusually short - if len(text_content) < 20: - self.logger.debug( - f"Post {i}: Unusually short content (length={len(text_content)}): {repr(text_content)}" - ) - - # Check if this post mentions flavor and is specifically about today/of the day - if "flavor" in text_lower and any( - keyword in text_lower for keyword in ["today", "daily", "of the day"] - ): - # Additional check: make sure it's announcing a flavor, not just mentioning it - # Look for patterns like "is our flavor", "is the flavor", "flavor of the day:" - if any( - pattern in text_lower - for pattern in [ - "is our flavor", - "is the flavor", - "flavor of the day:", - "flavor today:", - "today's flavor", - "daily flavor", - ] - ): - self.logger.debug(f"Found flavor post at index {i}") - return text_content - else: - self.logger.debug( - f"Post {i} has 'flavor' but doesn't appear to be announcing one" - ) - - self.logger.warning("No recent flavor post found in first 10 posts") - return None - - finally: - if browser: - try: - browser.close() - except Exception: - pass - - def _sanitize_flavor_name(self, flavor): - """ - Sanitize a flavor name by decoding HTML entities, removing emojis, and cleaning punctuation. - - This method: - - Strips leading/trailing whitespace and common leading punctuation (:,-) - - Decodes HTML entities (e.g., & → &) - - Removes emojis and truncates everything after the first emoji - - Truncates content after common terminators (!, ., double-space) - - Args: - flavor: Raw flavor name extracted from text - - Returns: - str: Sanitized flavor name - """ - # Strip leading/trailing whitespace and common leading punctuation - flavor = flavor.strip().lstrip(":,-") - # Decode HTML entities - flavor = html.unescape(flavor) - # Remove emojis and everything after them - # Covers: Emoticons, Transport/Map, Misc Symbols, Pictographs, Dingbats - flavor = re.sub( - r"\s*[\U0001F600-\U0001F64F\U0001F680-\U0001F6FF\U0001F300-\U0001F5FF" - r"\U0001F900-\U0001F9FF\u2600-\u26FF\u2700-\u27BF]+.*$", - "", - flavor, - ) - # Remove content after common terminators (for non-emoji cases) - # Split on '!', double space, or '.' and take first part - flavor = re.split(r"[!.]| ", flavor)[0].strip() - return flavor - - def _extract_flavor_name(self, text): - """ - Extract the flavor name from a Facebook post. - - Args: - text: Full text of the Facebook post - - Returns: - str: Extracted flavor name, or None if not found - """ - self.logger.debug(f"Extracting flavor from text: {text[:200]}") - - # Try various patterns to extract the flavor - patterns = [ - # "BUTTER PECAN is our flavor of the day" - flavor comes BEFORE - r"([A-Z][A-Z\s&]+?)\s+is\s+(?:our\s+)?(?:the\s+)?flavor(?:\s+of\s+the\s+day)?", - # "Flavor of the Day: Chocolate" or "Flavor: Chocolate" - flavor comes AFTER - r"flavor(?:\s+of\s+the\s+day)?[\s:]+(?:is\s+)?([A-Z][^\n.!?]+?)(?:\n|$|!|\.| )", - # "Today's flavor: Chocolate" - r"today'?s?\s+flavor[\s:]+(?:is\s+)?([A-Z][^\n.!?]+?)(?:\n|$|!|\.| )", - # "Today: Chocolate" or "Flavor Today: Chocolate" - r"(?:flavor\s+)?today[\s:]+([A-Z][^\n.!?]+?)(?:\n|$|!|\.| )", - ] - - for pattern in patterns: - match = re.search(pattern, text, re.IGNORECASE | re.MULTILINE) - if match: - flavor = match.group(1).strip() - flavor = self._sanitize_flavor_name(flavor) - - # Sanity check: make sure it's a reasonable flavor name - if 3 < len(flavor) < 100 and not flavor.lower().startswith("of the"): - self.logger.debug(f"Extracted flavor using pattern: {flavor}") - return flavor - - # Fallback: Look for flavor name in a structured way - lines = text.split("\n") - for i, line in enumerate(lines): - lower_line = line.lower() - if "flavor" in lower_line: - # Try the next line first (common format) - if i + 1 < len(lines): - next_line = lines[i + 1].strip() - if next_line and len(next_line) > 3 and len(next_line) < 100: - # Check if it looks like a flavor name (starts with capital) - if next_line[0].isupper(): - next_line = self._sanitize_flavor_name(next_line) - self.logger.debug(f"Extracted flavor from next line: {next_line}") - return next_line - - # Try to extract from the same line - cleaned = re.sub( - r".*?flavor(?:\s+of\s+the\s+day)?[\s:]*", "", line, flags=re.IGNORECASE - ) - cleaned = self._sanitize_flavor_name(cleaned) - if cleaned and 3 < len(cleaned) < 100 and not cleaned.lower().startswith("is"): - self.logger.debug(f"Extracted flavor from same line: {cleaned}") - return cleaned - - self.logger.warning("Could not extract flavor name using any method") - return None - - -def scrape_leons(): - """ - Standalone function to scrape Leon's Frozen Custard. - - Returns: - list: List of flavor dicts - """ - scraper = LeonsScraper() - return scraper.scrape() diff --git a/scripts/generate_flavors.py b/scripts/generate_flavors.py index fba5441..ce23641 100644 --- a/scripts/generate_flavors.py +++ b/scripts/generate_flavors.py @@ -24,7 +24,6 @@ from app.scrapers.culvers import scrape_culvers # noqa: E402 from app.scrapers.gilles import scrape_gilles # noqa: E402 from app.scrapers.kopps import scrape_kopps # noqa: E402 -from app.scrapers.leons import scrape_leons # noqa: E402 from app.scrapers.murfs import scrape_murfs # noqa: E402 from app.scrapers.oscars import scrape_oscars # noqa: E402 @@ -88,7 +87,6 @@ def scrape_all(): scrape_oscars, scrape_bubbas, scrape_gilles, - scrape_leons, scrape_bigdeal, ] diff --git a/static/index.html b/static/index.html index 0ac4528..b852c12 100644 --- a/static/index.html +++ b/static/index.html @@ -126,10 +126,6 @@

Filter Brands

Gilles -