Skip to content

Commit c41a665

Browse files
authored
Update (#2)
1 parent 8969eb4 commit c41a665

File tree

5 files changed

+56
-21
lines changed

5 files changed

+56
-21
lines changed

README.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ Scrapy_ project built following `Zyte’s web scraping tutorial`_.
1616
Requirements
1717
============
1818

19-
Python 3.8 or higher.
19+
Python 3.9 or higher.
2020

2121

2222
Setup

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
scrapy
1+
scrapy==2.11.2
22
scrapy-zyte-api
33
shub
44
zyte-spider-templates

scrapinghub.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
requirements:
22
file: requirements.txt
33
stacks:
4-
default: scrapy:2.11
4+
default: scrapy:2.11-20241022

tutorial/settings.py

Lines changed: 11 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
from itemadapter import ItemAdapter
2+
from zyte_common_items import ZyteItemAdapter
3+
4+
ItemAdapter.ADAPTER_CLASSES.appendleft(ZyteItemAdapter)
5+
16
# Scrapy settings for tutorial project
27
#
38
# For simplicity, this file contains only settings considered important or
@@ -96,27 +101,15 @@
96101
FEED_EXPORT_ENCODING = "utf-8"
97102

98103
# Custom settings
99-
DOWNLOAD_HANDLERS = {
100-
"http": "scrapy_zyte_api.ScrapyZyteAPIDownloadHandler",
101-
"https": "scrapy_zyte_api.ScrapyZyteAPIDownloadHandler",
104+
ADDONS = {
105+
"scrapy_zyte_api.Addon": 500,
106+
"zyte_spider_templates.Addon": 700,
102107
}
108+
ZYTE_API_KEY = "YOUR_API_KEY"
103109
DOWNLOADER_MIDDLEWARES = {
104-
"scrapy_poet.InjectionMiddleware": 543,
105-
"scrapy_zyte_api.ScrapyZyteAPIDownloaderMiddleware": 1000,
110+
"scrapy.downloadermiddlewares.stats.DownloaderStats": None,
111+
"scrapy_poet.DownloaderStatsMiddleware": 850,
106112
}
107-
REQUEST_FINGERPRINTER_CLASS = "scrapy_zyte_api.ScrapyZyteAPIRequestFingerprinter"
108-
ZYTE_API_TRANSPARENT_MODE = True
109113
SPIDER_MIDDLEWARES = {
110-
"scrapy_zyte_api.ScrapyZyteAPISpiderMiddleware": 100,
111114
"scrapy_poet.RetryMiddleware": 275,
112-
"zyte_spider_templates.middlewares.CrawlingLogsMiddleware": 1000,
113-
}
114-
SCRAPY_POET_DISCOVER = [
115-
"zyte_spider_templates.page_objects",
116-
]
117-
SCRAPY_POET_PROVIDERS = {
118-
"scrapy_zyte_api.providers.ZyteApiProvider": 1100,
119115
}
120-
CLOSESPIDER_TIMEOUT_NO_ITEM = 600
121-
SCHEDULER_DISK_QUEUE = "scrapy.squeues.PickleFifoDiskQueue"
122-
SCHEDULER_MEMORY_QUEUE = "scrapy.squeues.FifoMemoryQueue"
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
import json
2+
from base64 import b64decode
3+
4+
from scrapy import Request, Spider
5+
6+
7+
class QuotesToScrapeComScrollCaptureSpider(Spider):
8+
name = "quotes_toscrape_com_scroll_capture"
9+
10+
def start_requests(self):
11+
yield Request(
12+
"http://quotes.toscrape.com/scroll",
13+
meta={
14+
"zyte_api_automap": {
15+
"browserHtml": True,
16+
"actions": [
17+
{
18+
"action": "scrollBottom",
19+
},
20+
],
21+
"networkCapture": [
22+
{
23+
"filterType": "url",
24+
"httpResponseBody": True,
25+
"value": "/api/",
26+
"matchType": "contains",
27+
},
28+
],
29+
},
30+
},
31+
)
32+
33+
def parse(self, response):
34+
for capture in response.raw_api_response["networkCapture"]:
35+
text = b64decode(capture["httpResponseBody"]).decode()
36+
data = json.loads(text)
37+
for quote in data["quotes"]:
38+
yield {
39+
"author": quote["author"]["name"],
40+
"tags": quote["tags"],
41+
"text": quote["text"],
42+
}

0 commit comments

Comments (0)