Skip to content

Commit a5c8491

Browse files
Dean Sofer
authored and committed
Clustering
1 parent 48aaad9 commit a5c8491

File tree

3 files changed

+369
-105
lines changed

3 files changed

+369
-105
lines changed

crawler.js

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
/**
 * Scraper page function: extracts a single event record from an event page.
 *
 * A page is only treated as an event when it embeds a Google Maps "place"
 * iframe; the iframe's `q` query parameter is used as the event location and
 * is geocoded through the Google Geocoding API. Pages without such an iframe
 * are skipped.
 *
 * NOTE(review): `$` is used with `$.html(el)`, which looks like the Cheerio
 * API rather than browser jQuery — confirm against the scraper configuration.
 * Helpers are deliberately nested inside this function so the function stays
 * self-contained if the runner serializes it on its own.
 *
 * @param {object} context - Scraper context. Only `$`, `request.url`,
 *   `log.info` and `log.error` are read here.
 * @returns {Promise<object|null>} The extracted event record, or `null` when
 *   the page has no Google Maps embed.
 */
async function pageFunction(context) {
    const { $, request, log } = context;

    // NOTE(review): credential hard-coded in source control. Consider moving it
    // to scraper input / an environment secret and rotating this key.
    const API_KEY = 'AIzaSyA6u8cQEcT8e94ZGBdbdWkC17aUP1etCos';
    const MAP_IFRAME = 'https://www.google.com/maps/embed/v1/place?q=';

    /**
     * Geocodes one address string.
     * Returns the raw response, the parsed body (when the response is OK) and
     * the first result's `geometry.location` (when there is one).
     */
    const geocode = async (address) => {
        const response = await fetch(`https://maps.googleapis.com/maps/api/geocode/json?address=${encodeURIComponent(address)}&key=${API_KEY}`);
        if (!response.ok) {
            return { response, json: undefined, geometry: undefined };
        }
        const json = await response.json();
        return { response, json, geometry: json.results?.[0]?.geometry?.location };
    };

    const pageTitle = $('title').first().text();
    const mapIframe = $(`iframe[src^="${MAP_IFRAME}"]`);

    // Guard clause: pages without a map embed are not events.
    if (!mapIframe.length) {
        log.info(`SKIPPING URL: ${request.url}, TITLE: ${pageTitle}`);
        return null;
    }

    const location = (new URL(mapIframe.attr('src'))).searchParams.get('q');

    // Geocoding is best-effort: any failure is logged and `geometry` is left
    // undefined so the rest of the record is still returned.
    let geometry;
    try {
        let attempt = await geocode(location);
        if (!attempt.response.ok) {
            log.error('cannot geocode', {
                status: attempt.response.status,
                statusText: attempt.response.statusText,
            });
        } else {
            geometry = attempt.geometry;

            // If no geometry found, retry with ", san francisco" appended
            if (!geometry) {
                log.info('No geometry found, retrying with ", san francisco" appended');
                attempt = await geocode(`${location}, san francisco`);
                geometry = attempt.geometry;
                if (!geometry) {
                    log.error('geometry missing after retry', {
                        status: attempt.response.status,
                        json: attempt.json,
                    });
                }
            }
        }
    } catch (error) {
        log.error('cannot geocode', { error });
    }

    // Would love to use span.cost but it's occasionally missing...
    const date = $('#stats .left a:first').text();
    // The first text node of the span starts with two characters that are dropped.
    const time = $('#stats .left span:first').contents().first().text().slice(2).trim();
    const cost = $('span.cost:last').contents().last().text().trim();
    const cost_details = $('.cost_details').text();
    const venue = $('#stats .left br:first').next().text();
    const eventUrl = $('a[name*="Learn More"], a[name*="Event Details"], a[name*="RSVP"], a[name*="Buy Tickets"]').attr('href');
    const categories = $('.entry [rel="category tag"]').toArray().map((el) => $(el).text());

    // Swap async images with noscript images: each lazy <img> is replaced by the
    // inner HTML of its next sibling, and that sibling is then removed.
    $('img[data-spai]').each((i, el) => {
        const $el = $(el);
        const $noscript = $el.next();
        $el.replaceWith($noscript.html());
        $noscript.remove();
    });

    // Fix instagram embeds: replace the placeholder with an iframe pointing at
    // the permalink's "embed/captioned" URL.
    $('.instagram-media').each((i, el) => {
        const $el = $(el);
        const permalink = $el.attr('data-instgrm-permalink');
        let src;
        try {
            src = new URL(permalink);
        } catch (error) {
            // A missing/invalid permalink should not fail the whole page.
            log.error('Skipping instagram embed with invalid permalink', { permalink, error });
            return;
        }
        // Make sure the suffix is appended as new path segments.
        if (!src.pathname.endsWith('/')) {
            src.pathname += '/';
        }
        src.pathname += 'embed/captioned';
        $el.replaceWith(`<iframe src="${src.toString()}" />`);
    });

    // Remove ads
    $('section, style').remove();

    // The event body is everything between the two share-widget markers.
    let details = $('.at-above-post')
        .nextUntil('.at-below-post')
        .map((i, el) => $.html(el))
        .get()
        .join("\n");

    // Prepend wp-post-image if found (and no media-credit container follows the
    // first .at-above-post marker).
    try {
        const postImage = $('img.wp-post-image');
        if (postImage.length && !$('.at-above-post:first-of-type ~ .media-credit-container').length) {
            log.info('Prepending wp-post-image to details');
            details = `${$.html(postImage)}\n${details || ''}`;
        }
    } catch (error) {
        log.error('Error prepending wp-post-image to details', { error });
    }

    // Print some information to actor log
    log.info(`URL: ${request.url}, TITLE: ${pageTitle}`);

    // Return an object with the data extracted from the page.
    // It will be stored to the resulting dataset.
    return {
        url: request.url,
        eventUrl,
        title: pageTitle,
        location,
        geometry,
        date,
        date_text: date,
        categories,
        details,
        venue,
        time,
        cost,
        cost_details,
    };
}

0 commit comments

Comments
 (0)