/**
 * Scraper page function: extracts one event record from an event detail page.
 *
 * A page is treated as an event only when it embeds a Google Maps "place"
 * iframe; the iframe's `q` query parameter is used as the event location and
 * is geocoded (with one retry that appends ", san francisco"). The page DOM is
 * then cleaned up (lazy images, Instagram embeds, ads) before the post body is
 * serialized into `details`.
 *
 * Everything, including helpers, lives inside this single function because
 * scraper page functions are typically shipped as one self-contained function
 * (assumption based on the `context` shape used here).
 *
 * @param {object} context - Scraper context; this function reads `context.$`
 *   (Cheerio-style selector function), `context.request.url` and `context.log`.
 * @returns {Promise<object|null>} The extracted event record, or `null` when
 *   the page has no Google Maps embed and is skipped.
 */
async function pageFunction(context) {
  const { $, request, log } = context;

  // NOTE(review): this credential is committed in source. It should be moved
  // to scraper input / a secret store and rotated; kept here so behavior is
  // unchanged.
  const API_KEY = 'AIzaSyA6u8cQEcT8e94ZGBdbdWkC17aUP1etCos';
  const GEOCODE_ENDPOINT = 'https://maps.googleapis.com/maps/api/geocode/json';
  const MAP_IFRAME = 'https://www.google.com/maps/embed/v1/place?q=';

  const pageTitle = $('title').first().text();
  const mapIframe = $(`iframe[src^="${MAP_IFRAME}"]`);

  // Guard clause: pages without a map embed are not event pages.
  if (!mapIframe.length) {
    log.info(`SKIPPING URL: ${request.url}, TITLE: ${pageTitle}`);
    return null;
  }

  /**
   * Geocode a free-text address.
   * Resolves to the raw response, the parsed body (undefined when the HTTP
   * status is not ok) and the first result's `{ lat, lng }` (undefined when
   * there is no result).
   */
  const geocode = async (address) => {
    const url = new URL(GEOCODE_ENDPOINT);
    url.searchParams.set('address', address);
    url.searchParams.set('key', API_KEY);
    const response = await fetch(url.href);
    const json = response.ok ? await response.json() : undefined;
    return { response, json, geometry: json?.results?.[0]?.geometry?.location };
  };

  const location = new URL(mapIframe.attr('src')).searchParams.get('q');

  // Geocoding is best-effort: any failure is logged and `geometry` stays
  // undefined so the rest of the record is still returned.
  let geometry;
  try {
    const first = await geocode(location);
    if (!first.response.ok) {
      log.error('cannot geocode', { results: first.response });
    } else {
      geometry = first.geometry;

      // If no geometry found, retry with ", san francisco" appended
      if (!geometry) {
        log.info('No geometry found, retrying with ", san francisco" appended');
        const retry = await geocode(`${location}, san francisco`);
        geometry = retry.geometry;
        if (!geometry) {
          log.error('geometry missing after retry', {
            results: retry.response,
            json: retry.json ?? first.json,
          });
        }
      }
    }
  } catch (error) {
    log.error('cannot geocode', { error });
  }

  const date = $('#stats .left a:first').text();
  // The first text node of the span starts with a two-character prefix that
  // is dropped before trimming.
  const time = $('#stats .left span:first').contents().first().text().slice(2).trim();
  // Would love to use span.cost but it's occasionally missing...
  const cost = $('span.cost:last').contents().last().text().trim();
  const costDetails = $('.cost_details').text();
  const venue = $('#stats .left br:first').next().text();
  const eventUrl = $(
    'a[name*="Learn More"], a[name*="Event Details"], a[name*="RSVP"], a[name*="Buy Tickets"]',
  ).attr('href');
  const categories = $('.entry [rel="category tag"]')
    .toArray()
    .map((el) => $(el).text());

  // Swap async (lazy-loaded) images with the real markup kept in the
  // <noscript> sibling. Images without such a sibling are left untouched so
  // an unrelated neighbouring element is never removed.
  $('img[data-spai]').each((i, el) => {
    const $img = $(el);
    const $noscript = $img.next('noscript');
    if (!$noscript.length) {
      return;
    }
    $img.replaceWith($noscript.html());
    $noscript.remove();
  });

  // Fix instagram embeds: replace the placeholder with an iframe pointing at
  // the captioned embed URL derived from the permalink.
  $('.instagram-media').each((i, el) => {
    const $embed = $(el);
    const permalink = $embed.attr('data-instgrm-permalink');
    if (!permalink) {
      return;
    }
    let embedUrl;
    try {
      embedUrl = new URL(permalink);
    } catch (error) {
      // A malformed permalink must not abort extraction of the whole page.
      log.error('Invalid instagram permalink', { permalink, error });
      return;
    }
    if (!embedUrl.pathname.endsWith('/')) {
      embedUrl.pathname += '/';
    }
    embedUrl.pathname += 'embed/captioned';
    $embed.replaceWith(`<iframe src="${embedUrl.toString()}"></iframe>`);
  });

  // Remove ads
  $('section, style').remove();

  // The post body is everything between the two marker elements.
  let details = $('.at-above-post')
    .nextUntil('.at-below-post')
    .map((i, el) => $.html(el))
    .get()
    .join('\n');

  // Prepend wp-post-image if found (and not already part of the post body
  // via a media-credit container).
  try {
    const postImage = $('img.wp-post-image');
    const hasCreditedImage = $('.at-above-post:first-of-type ~ .media-credit-container').length;
    if (postImage.length && !hasCreditedImage) {
      log.info('Prepending wp-post-image to details');
      details = `${$.html(postImage)}\n${details || ''}`;
    }
  } catch (error) {
    log.error('Error prepending wp-post-image to details', { error });
  }

  // Print some information to actor log
  log.info(`URL: ${request.url}, TITLE: ${pageTitle}`);

  // Return an object with the data extracted from the page.
  // It will be stored to the resulting dataset.
  return {
    url: request.url,
    eventUrl,
    title: pageTitle,
    location,
    geometry,
    date,
    date_text: date,
    categories,
    details,
    venue,
    time,
    cost,
    cost_details: costDetails,
  };
}
0 commit comments