possibly related to #175
pyppeteer is losing its connection to the Chromium browser (maybe because of something that happens inside Chromium?). However, when we run the same logic on Node.js, we get no errors.
I have attached a script to replicate the issue (apologies, it's not cleaned up - it's just exploratory code).
Here is the output of the script:
[I:pyppeteer.launcher] Browser listening on: ws://127.0.0.1:51578/devtools/browser/a2c27856-5b78-490b-a2a4-cce6c03f14fb
ws://127.0.0.1:51578/devtools/browser/a2c27856-5b78-490b-a2a4-cce6c03f14fb
1 https://ui.adsabs.harvard.edu/#abs/2019MNRAS.482.4726N/abstract 55657
2 https://ui.adsabs.harvard.edu/#abs/2019NewA...66...20P/abstract 56716
3 https://ui.adsabs.harvard.edu/#abs/2019NewA...66...40N/abstract 53318
4 https://ui.adsabs.harvard.edu/#abs/2019JMoSt1177..418K/abstract 57338
5 https://ui.adsabs.harvard.edu/#abs/2019MNRAS.482.4372C/abstract 56445
6 https://ui.adsabs.harvard.edu/#abs/2019MNRAS.482.4726N/abstract 56319
7 https://ui.adsabs.harvard.edu/#abs/2019MNRAS.482.5167S/abstract 58452
8 https://ui.adsabs.harvard.edu/#abs/2019MNRAS.482.5459M/abstract 58754
9 https://ui.adsabs.harvard.edu/#abs/2019MNRAS.482.5567M/abstract 59967
10 https://ui.adsabs.harvard.edu/#abs/2019MNRAS.483..458B/abstract 69867
11 https://ui.adsabs.harvard.edu/#abs/2019MNRAS.483..529C/abstract 67922
12 https://ui.adsabs.harvard.edu/#abs/2019MNRAS.483L..47C/abstract 54674
13 https://ui.adsabs.harvard.edu/#abs/2019NewA...67....1N/abstract 62676
14 https://ui.adsabs.harvard.edu/#abs/2019NewA...68...51M/abstract 55099
15 https://ui.adsabs.harvard.edu/#abs/2019PhyE..107....5B/abstract 54913
[I:pyppeteer.connection] connection closed
DISCONNECTED
Protocol Error (Runtime.callFunctionOn): Session closed. Most likely the page has been closed.
Traceback (most recent call last):
File "client.py", line 53, in main
}''')
File "/dvt/workspace2/ADSTurboBee/python3.7/lib/python3.7/site-packages/pyppeteer/page.py", line 1158, in evaluate
return await frame.evaluate(pageFunction, *args, force_expr=force_expr)
File "/dvt/workspace2/ADSTurboBee/python3.7/lib/python3.7/site-packages/pyppeteer/frame_manager.py", line 295, in evaluate
pageFunction, *args, force_expr=force_expr)
File "/dvt/workspace2/ADSTurboBee/python3.7/lib/python3.7/site-packages/pyppeteer/execution_context.py", line 55, in evaluate
pageFunction, *args, force_expr=force_expr)
File "/dvt/workspace2/ADSTurboBee/python3.7/lib/python3.7/site-packages/pyppeteer/execution_context.py", line 109, in evaluateHandle
_rewriteError(e)
File "/dvt/workspace2/ADSTurboBee/python3.7/lib/python3.7/site-packages/pyppeteer/execution_context.py", line 239, in _rewriteError
raise error
File "/dvt/workspace2/ADSTurboBee/python3.7/lib/python3.7/site-packages/pyppeteer/execution_context.py", line 106, in evaluateHandle
'userGesture': True,
File "/dvt/workspace2/ADSTurboBee/python3.7/lib/python3.7/site-packages/pyppeteer/connection.py", line 218, in send
f'Protocol Error ({method}): Session closed. Most likely the '
pyppeteer.errors.NetworkError: Protocol Error (Runtime.callFunctionOn): Session closed. Most likely the page has been closed.
Fatal error https://ui.adsabs.harvard.edu/#abs/2019MNRAS.482.5553J/abstract
Reconnected ws://127.0.0.1:51578/devtools/browser/a2c27856-5b78-490b-a2a4-cce6c03f14fb
[<pyppeteer.page.Page object at 0x7fe4a9784390>, <pyppeteer.page.Page object at 0x7fe4a9784588>]
9 out of 10 times, I get the error after 15 URLs have been loaded. This seems to suggest some sort of timeout (in the underlying websocket?).
I tried running with verbose logs, but can't see anything out of the ordinary (around the time the DISCONNECT event happens).
import time
import asyncio
import signal
from pyppeteer import launch, connect
from functools import partial
import traceback
import logging
import sys
# Verbosity applied to the loggers of pyppeteer and of the websockets
# protocol layer; change this single value to adjust both at once.
pyppeteer_level = logging.INFO
for logger_name in ('pyppeteer', 'websockets.protocol'):
    logging.getLogger(logger_name).setLevel(pyppeteer_level)
# Work queue of abstract pages to render. The list is mutated at runtime:
# main() removes entries as it goes and signal_handler() empties it.
# Note that the 2019MNRAS.482.4726N entry appears three times.
urls = [
    'https://ui.adsabs.harvard.edu/#abs/2019MNRAS.482.4726N/abstract',
    'https://ui.adsabs.harvard.edu/#abs/2019MNRAS.482.4726N/abstract',
    'https://ui.adsabs.harvard.edu/#abs/2019NewA...66...20P/abstract',
    'https://ui.adsabs.harvard.edu/#abs/2019NewA...66...31M/abstract',
    'https://ui.adsabs.harvard.edu/#abs/2019NewA...66...40N/abstract',
    'https://ui.adsabs.harvard.edu/#abs/2019AcSpA.209..264M/abstract',
    'https://ui.adsabs.harvard.edu/#abs/2019JMoSt1177..418K/abstract',
    'https://ui.adsabs.harvard.edu/#abs/2019MNRAS.482.4364C/abstract',
    'https://ui.adsabs.harvard.edu/#abs/2019MNRAS.482.4372C/abstract',
    'https://ui.adsabs.harvard.edu/#abs/2019MNRAS.482.4422C/abstract',
    'https://ui.adsabs.harvard.edu/#abs/2019MNRAS.482.4726N/abstract',
    'https://ui.adsabs.harvard.edu/#abs/2019MNRAS.482.4985A/abstract',
    'https://ui.adsabs.harvard.edu/#abs/2019MNRAS.482.5167S/abstract',
    'https://ui.adsabs.harvard.edu/#abs/2019MNRAS.482.5349R/abstract',
    'https://ui.adsabs.harvard.edu/#abs/2019MNRAS.482.5459M/abstract',
    'https://ui.adsabs.harvard.edu/#abs/2019MNRAS.482.5553J/abstract',
    'https://ui.adsabs.harvard.edu/#abs/2019MNRAS.482.5567M/abstract',
    'https://ui.adsabs.harvard.edu/#abs/2019MNRAS.483..392D/abstract',
    'https://ui.adsabs.harvard.edu/#abs/2019MNRAS.483..458B/abstract',
    'https://ui.adsabs.harvard.edu/#abs/2019MNRAS.483..711A/abstract',
    'https://ui.adsabs.harvard.edu/#abs/2019MNRAS.483..529C/abstract',
    'https://ui.adsabs.harvard.edu/#abs/2019MNRAS.483..840B/abstract',
    'https://ui.adsabs.harvard.edu/#abs/2019MNRAS.483L..47C/abstract',
    'https://ui.adsabs.harvard.edu/#abs/2019MNRAS.483L..64D/abstract',
    'https://ui.adsabs.harvard.edu/#abs/2019NewA...67....1N/abstract',
    'https://ui.adsabs.harvard.edu/#abs/2019NewA...67...45V/abstract',
    'https://ui.adsabs.harvard.edu/#abs/2019NewA...68...51M/abstract',
    'https://ui.adsabs.harvard.edu/#abs/2019IJAEO..75...15B/abstract',
    'https://ui.adsabs.harvard.edu/#abs/2019PhyE..107....5B/abstract',
    'https://ui.adsabs.harvard.edu/#abs/2019SurSc.681...32E/abstract',
    'https://ui.adsabs.harvard.edu/#abs/2019CNSNS..70...89J/abstract',
    'https://ui.adsabs.harvard.edu/#abs/2019CNSNS..70..223O/abstract',
    'https://ui.adsabs.harvard.edu/#abs/2015AIPC.1672m0003H/abstract',
    'https://ui.adsabs.harvard.edu/#abs/2019MNRAS.482.1858Y/abstract',
    'https://ui.adsabs.harvard.edu/#abs/2019Icar..319....1A/abstract',
    'https://ui.adsabs.harvard.edu/#abs/2019MNRAS.482.1786L/abstract',
    'https://ui.adsabs.harvard.edu/#abs/2019MNRAS.482.1733G/abstract',
    'https://ui.adsabs.harvard.edu/#abs/2019MNRAS.482.5018E/abstract',
]
def signal_handler(signal, frame):
    """Discard every URL still queued in the module-level ``urls`` list.

    The list is emptied in place (the same list object is kept), so any
    loop that keeps running only while ``urls`` is non-empty will stop.

    Args:
        signal: Signal number delivered to the handler (unused). Inside
            this function the name shadows the ``signal`` module.
        frame: Stack frame that was interrupted (unused).
    """
    urls.clear()
# Route SIGINT (Ctrl-C) to signal_handler, which empties the `urls` queue.
signal.signal(signal.SIGINT, signal_handler)
# Event loop used at the bottom of the script to run main().
loop = asyncio.get_event_loop()
async def get_browser():
    """Launch a headless browser through pyppeteer and return it.

    A listener is attached that writes ``DISCONNECTED`` to stderr when the
    browser emits its ``'disconnected'`` event.

    Returns:
        The browser object produced by ``pyppeteer.launch``.
    """
    # Notes from earlier experiments with other launch arguments:
    #   '--disable-dev-shm-usage'                  : two errs, 16 batch
    #   '--shm-size=1gb'                           : identical to previous
    #   no args                                    : identical
    #   '--no-sandbox', '--disable-setuid-sandbox' : no diff
    launch_options = {
        'headless': True,
        'waitUntil': ['load', 'domcontentloaded', 'networkidle0'],
        'args': ['--enable-popup-blocking'],
    }
    chromium = await launch(options=launch_options)

    def report_disconnect():
        # Make the loss of the browser connection visible on stderr.
        return sys.stderr.write("DISCONNECTED\n")

    chromium.on('disconnected', report_disconnect)
    return chromium
async def main():
    """Render every URL queued in ``urls`` and save each page's HTML.

    A browser is obtained from ``get_browser()``. For each queued URL the
    page is loaded, its ``document.documentElement.outerHTML`` is read and
    written to ``<bibcode>.html`` in the current directory, where the
    bibcode is the second-to-last ``/``-separated component of the URL. A
    ``<base>`` tag pointing at ``https://ui.adsabs.harvard.edu/`` is
    inserted after the ``<meta charset="utf-8">`` tag.

    URLs are always taken from the head of ``urls`` and removed only after
    they have been handled, so no entry is skipped and, on failure, the URL
    that is reported and dropped is the one that actually failed.

    On any exception the failing URL is dropped, a diagnostic reconnect to
    the current browser's websocket endpoint is attempted (best effort), and
    a fresh browser is launched before continuing with a new page.

    The SIGINT handler may empty ``urls`` while a page is being processed;
    every removal is therefore guarded against an already-empty list.

    Prints progress lines while running and ``num_errors=<count>`` at the
    end.
    """
    browser = await get_browser()
    print(browser.wsEndpoint)
    wsEndpoint = browser.wsEndpoint
    errc = 0

    # Same JavaScript text as before, spelled out line by line.
    snapshot_js = (
        '() => {\n'
        'return {\n'
        'html: document.documentElement.outerHTML\n'
        '}\n'
        '}'
    )
    goto_options = {
        'waitUntil': ['load', 'domcontentloaded', 'networkidle0'],
        'timeout': 30000,
    }

    while urls:
        try:
            page = await browser.newPage()
            # Accept arbitrary arguments: a zero-argument callback would
            # raise TypeError if the emitter passes an event payload.
            page.once('error', lambda *args: print('ERROR'))
            page.once('pageerror', lambda *args: print('PAGEERROR'))
            page.once('requestfailed', lambda *args: print('REQUESTFAILED'))
            i = 0
            # Peek at the head instead of iterating over `urls` directly:
            # removing items from a list while a for-loop walks it makes
            # the loop skip every other element.
            while urls:
                u = urls[0]
                await page.goto(u, options=goto_options)
                await asyncio.sleep(1)
                content = await page.evaluate(snapshot_js)
                i += 1
                html = content['html']
                print(i, u, len(html))
                bibc = u.split('/')[-2]
                # The page declares utf-8, so write it as utf-8 regardless
                # of the locale; `with` closes the file even on errors.
                with open(bibc + '.html', 'w', encoding='utf-8') as fo:
                    fo.write(html.replace(
                        '<meta charset="utf-8">',
                        '<meta charset="utf-8"><base href="https://ui.adsabs.harvard.edu/" />'))
                # NOTE: closing the page at this point was observed to give
                # "NetworkError: Protocol Error (Page.navigate): Session
                # closed. Most likely the page has been closed." on the next
                # page.goto, so the page is deliberately left open.
                if urls:  # may have been emptied by the SIGINT handler
                    urls.pop(0)
        except Exception as e:
            await asyncio.sleep(1)
            errc += 1
            print(e)
            print(traceback.format_exc())
            # The head of the queue is the URL that was in flight.
            print('Fatal error', urls.pop(0) if urls else None)
            # Diagnostics only: try to reattach to the browser that failed
            # and list its pages. A failure here must not stop the run.
            try:
                stale = await connect(options={'browserWSEndpoint': wsEndpoint})
                print('Reconnected ' + stale.wsEndpoint)
                print(await stale.pages())
            except Exception as reconnect_error:
                print('Reconnect failed', reconnect_error)
            await asyncio.sleep(1)
            browser = await get_browser()
            # Track the new browser so a later reconnect targets it rather
            # than the endpoint of the first browser.
            wsEndpoint = browser.wsEndpoint
    await browser.close()
    print('num_errors={}'.format(errc))
# Script entry point: drive main() to completion on the module-level event loop.
loop.run_until_complete(main())