I'm using the Chromium engine in Splash to render a page that the default WebKit engine can't handle. I run the crawl with scrapy crawl example and this spider:
import scrapy
from scrapy_splash import SplashRequest


class ExampleSpider(scrapy.Spider):
    name = 'example'
    url = 'http://example.com/'

    def start_requests(self):
        yield SplashRequest(
            url=self.url,
            callback=self.parse,
            endpoint='render.html',  # have gone with and without specifying
            headers=None,            # have gone with and without specifying
            args={
                'wait': 0.5,
                'engine': 'chromium',
                # 'headers': None,   # also tried this
            },
        )

    def parse(self, response, **kwargs):
        yield None
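Settings are the usual scrapy-splash wiring from the README; roughly this, with nothing custom beyond it as far as I can tell:

SPLASH_URL = 'http://localhost:8050'  # matches the endpoint in the error below

DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'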
However, when the spider runs, I get:
2022-11-02 19:56:00 [scrapy.core.scraper] ERROR: Error downloading <GET http://example.com/ via http://localhost:8050/render.html>
Traceback (most recent call last):
File "/path/venv/lib/python3.10/site-packages/twisted/internet/defer.py", line 1696, in _inlineCallbacks
result = context.run(gen.send, result)
File "/path/venv/lib/python3.10/site-packages/scrapy/core/downloader/middleware.py", line 60, in process_response
response = yield deferred_from_coro(method(request=request, response=response, spider=spider))
File "/path/venv/lib/python3.10/site-packages/scrapy_splash/middleware.py", line 412, in process_response
response = self._change_response_class(request, response)
File "/path/venv/lib/python3.10/site-packages/scrapy_splash/middleware.py", line 433, in _change_response_class
response = response.replace(cls=respcls, request=request)
File "/path/venv/lib/python3.10/site-packages/scrapy/http/response/__init__.py", line 117, in replace
return cls(*args, **kwargs)
File "/path/venv/lib/python3.10/site-packages/scrapy_splash/response.py", line 119, in __init__
self._load_from_json()
File "/path/venv/lib/python3.10/site-packages/scrapy_splash/response.py", line 165, in _load_from_json
error = self.data['info']['error']
TypeError: string indices must be integers
The TypeError above looks like scrapy-splash choking on an error body that isn't the JSON structure it expects, which hides the real problem. The root of the issue shows up in the Splash container logs:
2022-11-02 23:56:00.000802 [-] Unhandled error in Deferred:
2022-11-02 23:56:00.000931 [-] Unhandled Error
Traceback (most recent call last):
File "/app/splash/pool.py", line 47, in render
self.queue.put(slot)
File "/usr/local/lib/python3.6/dist-packages/twisted/internet/defer.py", line 1872, in put
self.waiting.pop(0).callback(obj)
File "/usr/local/lib/python3.6/dist-packages/twisted/internet/defer.py", line 460, in callback
self._startRunCallbacks(result)
File "/usr/local/lib/python3.6/dist-packages/twisted/internet/defer.py", line 568, in _startRunCallbacks
self._runCallbacks()
--- <exception caught here> ---
File "/usr/local/lib/python3.6/dist-packages/twisted/internet/defer.py", line 654, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "/app/splash/pool.py", line 76, in _start_render
render.start(**slot_args.kwargs)
File "/app/splash/engines/chromium/render_scripts.py", line 59, in start
raise BadOption("headers is not implemented")
splash.errors.BadOption: headers is not implemented
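My reading of this is that I'm not passing headers myself; the scrapy-splash middleware forwards the Scrapy request headers to Splash as a 'headers' render argument, and the Chromium engine rejects that option. If that's right, the README's dont_send_headers flag should suppress it; a sketch of what I mean (not verified to work with the Chromium engine):

import scrapy
from scrapy_splash import SplashRequest


class ExampleSpider(scrapy.Spider):
    name = 'example'
    url = 'http://example.com/'

    def start_requests(self):
        # Sketch: per the scrapy-splash README, dont_send_headers=True stops
        # the middleware from forwarding the Scrapy request headers to Splash,
        # which is what appears to become the rejected 'headers' argument.
        yield SplashRequest(
            url=self.url,
            callback=self.parse,
            endpoint='render.html',
            dont_send_headers=True,
            args={'wait': 0.5, 'engine': 'chromium'},
        )

    def parse(self, response, **kwargs):
        yield None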
This doesn't seem to be a problem when rendering the same page with the Chromium engine through the Splash UI in the browser, or with curl against render.html directly.
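In Python terms, that direct check is just a plain render.html request with the same arguments; a sketch of it (the exact parameters I used may have differed slightly):

# Hit Splash's render.html endpoint directly, bypassing Scrapy entirely.
from urllib.parse import urlencode
from urllib.request import urlopen

params = urlencode({
    'url': 'http://example.com/',
    'engine': 'chromium',
    'wait': 0.5,
})
with urlopen(f'http://localhost:8050/render.html?{params}') as resp:
    print(resp.status)         # 200 when the render succeeds
    print(resp.read()[:200])   # start of the rendered HTML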
Versions:
Python 3.10.7
Scrapy 2.7.0
scrapy-splash 0.8.0
Splash 3.5.0