Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 61 additions & 32 deletions app/scrapers/bigdeal.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,50 +168,79 @@ def _scrape_facebook_page_attempt(self, url, attempt):
except Exception:
pass

# Expand "See more" links to reveal full post content
try:
see_more_buttons = page.query_selector_all('text="See more"')
self.logger.debug(f"Found {len(see_more_buttons)} 'See more' buttons to expand")
for btn in see_more_buttons[:10]: # Expand up to 10 posts
try:
if btn.is_visible():
btn.click()
page.wait_for_timeout(500) # Wait for expansion
except Exception:
pass # Continue if a button fails to click
except Exception as e:
self.logger.debug(f"Could not expand 'See more' buttons: {e}")

# Get all post articles (after expansion)
articles = page.query_selector_all('[role="article"]')

# Filter out nested articles (comments) so we only process top-level posts.
# Facebook uses role="article" for both posts and comments. This logic mirrors
# the approach used in the Leon's scraper to avoid treating comments as posts.
# Get all post articles first
all_articles = page.query_selector_all('[role="article"]')
self.logger.debug(f"Found {len(all_articles)} total articles (including comments)")

# Filter out nested articles (comments) - only keep top-level posts
# Comments are article elements nested within post article elements
top_level_articles = []
for article in articles:
for article in all_articles:
# Check if this article is nested within another article
# by looking for a parent with role="article"
try:
# In the page context, check if this node has an ancestor with role="article"
# that is not itself. If it does, it's a nested article (e.g., a comment).
has_parent_article = article.evaluate(
"""(node) => {
if (!node || !node.parentElement) {
return false;
parent_article = article.evaluate(
"""(element) => {
let parent = element.parentElement;
while (parent) {
if (parent.getAttribute('role') === 'article' && parent !== element) {
return true;
}
parent = parent.parentElement;
}
const parentArticle = node.parentElement.closest('[role="article"]');
return parentArticle !== null && parentArticle !== node;
return false;
}"""
)
if not has_parent_article:
if not parent_article:
top_level_articles.append(article)
except Exception:
# If anything goes wrong during evaluation, fall back to keeping the article
# If evaluation fails, include the article to be safe
top_level_articles.append(article)

self.logger.debug(
f"Found {len(articles)} total articles, {len(top_level_articles)} top-level posts on Facebook page"
f"Filtered to {len(top_level_articles)} top-level posts (excluding comments)"
)

# Now expand "See more" links ONLY in top-level posts to reveal full content
expanded_count = 0
for idx, article in enumerate(top_level_articles[:10]): # Process first 10 posts
try:
# Look for "See more" button within this specific article
# Try multiple selectors as Facebook structure can vary
see_more = None
selectors = [
'div[role="button"]:has-text("See more")',
'[role="button"]:has-text("See more")',
'text="See more"',
]

for selector in selectors:
try:
see_more = article.query_selector(selector)
if see_more:
break
except Exception:
continue

if see_more and see_more.is_visible():
self.logger.debug(f"Expanding 'See more' in post {idx}")
see_more.click()
page.wait_for_timeout(500) # Wait for expansion
expanded_count += 1
else:
self.logger.debug(f"No 'See more' button found in post {idx}")
except Exception as e:
self.logger.debug(f"Could not expand 'See more' in article {idx}: {e}")
pass # Continue if expansion fails

self.logger.debug(
f"Expanded {expanded_count} 'See more' buttons in top-level posts"
)

# Wait a bit longer after all expansions to let content fully render
if expanded_count > 0:
page.wait_for_timeout(1000)

# Look through recent posts for flavor information
for i, article in enumerate(top_level_articles[:10]): # Check first 10 posts
# Check if post is from today
Expand Down
73 changes: 37 additions & 36 deletions tests/test_bigdeal_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,27 @@ def tearDown(self):
"""Clean up patches."""
self.locations_patcher.stop()

def _create_mock_article(self, text_content, is_nested=False):
"""
Create a properly mocked article with all required methods.

Args:
text_content: The text content to return from inner_text()
is_nested: Whether this article is nested within another (i.e., a comment)

Returns:
Mock article object
"""
mock_article = Mock()
mock_article.inner_text.return_value = text_content
# Mock evaluate() to return whether article is nested (False = top-level post)
mock_article.evaluate.return_value = is_nested
# Mock query_selector() to return None (no "See more" button)
mock_article.query_selector.return_value = None
# Mock is_visible() in case it's checked
mock_article.is_visible.return_value = True
return mock_article

@patch("app.scrapers.bigdeal.is_facebook_post_from_today")
@patch("app.scrapers.bigdeal.sync_playwright")
def test_scrape_facebook_success_first_post(self, mock_playwright, mock_is_today):
Expand All @@ -281,10 +302,9 @@ def test_scrape_facebook_success_first_post(self, mock_playwright, mock_is_today
mock_browser = Mock()
mock_context = Mock()
mock_page = Mock()
mock_article = Mock()

# Setup article with flavor content
mock_article.inner_text.return_value = "Today's flavor is Vanilla Bean!"
mock_article = self._create_mock_article("Today's flavor is Vanilla Bean!")

# Setup page to return one article
mock_page.query_selector_all.return_value = [mock_article]
Expand Down Expand Up @@ -316,14 +336,9 @@ def test_scrape_facebook_success_third_post(self, mock_playwright, mock_is_today
mock_page = Mock()

# Create 3 articles - first two without flavor keywords
mock_article1 = Mock()
mock_article1.inner_text.return_value = "Happy Monday everyone! Visit us soon."

mock_article2 = Mock()
mock_article2.inner_text.return_value = "Check out our new hours!"

mock_article3 = Mock()
mock_article3.inner_text.return_value = "Today's custard flavor: CHOCOLATE CHIP!"
mock_article1 = self._create_mock_article("Happy Monday everyone! Visit us soon.")
mock_article2 = self._create_mock_article("Check out our new hours!")
mock_article3 = self._create_mock_article("Today's custard flavor: CHOCOLATE CHIP!")

mock_page.query_selector_all.return_value = [mock_article1, mock_article2, mock_article3]

Expand Down Expand Up @@ -371,9 +386,7 @@ def test_scrape_facebook_no_flavor_post(self, mock_playwright, mock_is_today):
# Create articles without flavor content
articles = []
for i in range(5):
mock_article = Mock()
mock_article.inner_text.return_value = f"General post {i} about our restaurant."
articles.append(mock_article)
articles.append(self._create_mock_article(f"General post {i} about our restaurant."))

mock_page.query_selector_all.return_value = articles

Expand Down Expand Up @@ -426,14 +439,9 @@ def test_scrape_facebook_skips_old_posts(self, mock_playwright, mock_is_today):
mock_page = Mock()

# Create 3 articles
mock_article1 = Mock()
mock_article1.inner_text.return_value = "Yesterday's flavor was Chocolate"

mock_article2 = Mock()
mock_article2.inner_text.return_value = "Old post about custard"

mock_article3 = Mock()
mock_article3.inner_text.return_value = "Today's flavor is Vanilla!"
mock_article1 = self._create_mock_article("Yesterday's flavor was Chocolate")
mock_article2 = self._create_mock_article("Old post about custard")
mock_article3 = self._create_mock_article("Today's flavor is Vanilla!")

mock_page.query_selector_all.return_value = [mock_article1, mock_article2, mock_article3]

Expand All @@ -459,8 +467,7 @@ def test_scrape_facebook_scrolls_before_querying(self, mock_playwright, mock_is_
mock_browser = Mock()
mock_context = Mock()
mock_page = Mock()
mock_article = Mock()
mock_article.inner_text.return_value = "Today's flavor is Vanilla Bean!"
mock_article = self._create_mock_article("Today's flavor is Vanilla Bean!")

mock_page.query_selector_all.return_value = [mock_article]
mock_context.new_page.return_value = mock_page
Expand All @@ -479,7 +486,7 @@ def test_scrape_facebook_scrolls_before_querying(self, mock_playwright, mock_is_
@patch("app.scrapers.bigdeal.is_facebook_post_from_today")
@patch("app.scrapers.bigdeal.sync_playwright")
def test_scrape_facebook_page_expands_see_more_buttons(self, mock_playwright, mock_is_today):
"""Test: 'See more' buttons are clicked to expand truncated posts."""
"""Test: 'See more' buttons are expanded per-article (not page-wide)."""
mock_is_today.return_value = True

mock_browser = Mock()
Expand All @@ -490,16 +497,11 @@ def test_scrape_facebook_page_expands_see_more_buttons(self, mock_playwright, mo
mock_see_more_btn = Mock()
mock_see_more_btn.is_visible.return_value = True

mock_article = Mock()
mock_article.inner_text.return_value = "Today's flavor is Vanilla Bean!"

# Route query_selector_all calls by selector argument
def selector_side_effect(selector):
if selector == 'text="See more"':
return [mock_see_more_btn]
return [mock_article]
# Article returns a "See more" button when queried with per-article selector
mock_article = self._create_mock_article("Today's flavor is Vanilla Bean!")
mock_article.query_selector.return_value = mock_see_more_btn

mock_page.query_selector_all.side_effect = selector_side_effect
mock_page.query_selector_all.return_value = [mock_article]

mock_context.new_page.return_value = mock_page
mock_browser.new_context.return_value = mock_context
Expand All @@ -509,7 +511,7 @@ def selector_side_effect(selector):

result = self.scraper._scrape_facebook_page("https://facebook.com/test")

# Verify "See more" button was clicked to expand truncated content
# Verify "See more" button was clicked per-article to expand truncated content
mock_see_more_btn.click.assert_called_once()
self.assertEqual(result, "Today's flavor is Vanilla Bean!")

Expand All @@ -524,8 +526,7 @@ def test_scrape_facebook_browser_close_error(self, mock_playwright, mock_is_toda
mock_context = Mock()
mock_page = Mock()

mock_article = Mock()
mock_article.inner_text.return_value = "Today's flavor: Strawberry"
mock_article = self._create_mock_article("Today's flavor: Strawberry")

mock_page.query_selector_all.return_value = [mock_article]
mock_context.new_page.return_value = mock_page
Expand Down
Loading