selenium for python + chromedriver で、page全体の画面キャプチャを連続取得 - end0tknr's kipple - web写経開発
先程のエントリの別バージョン。
chrome driverを動作させてみると、画面キャプチャ10枚程度で動作が止まってしまう...
原因不明ですが、firefoxdriver を書いてみたところ、上手く動作するみたい。
しかも、先程のように画面キャプチャ用browserの別プロセス化も不要。
こちらの方がよさそうです。
#!python
# -*- coding: utf-8 -*-
"""Capture full-page screenshots of a list of pages with headless Firefox.

Usage:
    <script> PROTOCOL_and_FQDN HTML_PATHS_FILE

PROTOCOL_and_FQDN is the site prefix (for example ``https://example.com``)
and HTML_PATHS_FILE is a text file holding one request path per line.
One PNG per path is written to ``conf["img_save_dir"]``.
"""
import getopt
import os
import re
import sys
import time

import pathvalidate
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait

conf = {
    "geckodriver_path": os.getcwd() + "\\..\\selenium\\geckodriver.exe",
    "img_save_dir": os.getcwd(),
}

# Seconds to wait between two attempts of a failing browser operation.
RETRY_WAIT_SEC = 2


def main():
    """Visit every path listed in the paths file and save a screenshot of it.

    Prints a usage line and returns ``None`` when the command line does not
    carry exactly two arguments.  Exits with status 1 when a page cannot be
    loaded, resized or captured.  The browser session is always shut down,
    including on those early exits.
    """
    if len(sys.argv) != 3:
        print("USAGE:", sys.argv[0], "PROTOCOL_and_FQDN HTML_PATHS_FILE")
        return None
    req_domain, req_paths_file = sys.argv[1], sys.argv[2]

    # Read the path list before starting Firefox so that an unreadable file
    # does not leave a browser process behind.
    print(req_paths_file)
    req_paths = load_html_paths(req_paths_file)

    # Start the browser in headless mode.
    browser = init_browser()
    try:
        for i, req_path in enumerate(req_paths, start=1):
            req_url = req_domain + "/" + req_path
            print(i, req_url)

            # A window that was enlarged for the previous page and is only
            # shrunk afterwards still seems to produce a large image, so
            # reset it to a smallish size before loading the next page.
            browser.set_window_size(980, 800)

            if not http_get_url(browser, req_url, 5):
                print("fail http_get_url()", req_url)
                sys.exit(1)

            # Skip pages that redirected to a completely different site.
            # (re.match() yields None, never False, so the result must not
            # be compared against False; a literal prefix test also avoids
            # treating the dots of the host name as regex wildcards.)
            if not browser.current_url.startswith(req_domain):
                print("SKIP ! different host" + browser.current_url)
                continue

            # Grow the window to the document size to capture the whole page.
            if not set_window_size(browser, 5):
                print("error set_window_size()")
                sys.exit(1)

            # Derive the output file name from the request path.
            file_stem = pathvalidate.sanitize_filename(
                req_path.replace("/", "_"))
            img_save_path = os.path.join(
                conf["img_save_dir"], file_stem + ".png")
            print("  save at " + img_save_path)

            if not save_screenshot(browser, img_save_path, 5):
                print("error save_screenshot()")
                sys.exit(1)
    finally:
        # quit() ends the whole WebDriver session (browser and driver
        # process); sys.exit() above raises SystemExit, so this runs on the
        # failure paths as well.
        browser.quit()


def save_screenshot(browser, img_save_path, max_retry):
    """Save a screenshot of the current window to *img_save_path*.

    Args:
        browser: WebDriver instance to capture from.
        img_save_path: Destination path of the PNG file.
        max_retry: Maximum number of attempts.

    Returns:
        True once an attempt succeeds, False when every attempt failed.
    """
    for _ in range(max_retry):
        try:
            # WebDriver.save_screenshot() reports a write failure by
            # returning False instead of raising, so check the result too.
            if browser.save_screenshot(img_save_path):
                return True
            print("retry save_screenshot()")
        except Exception as e:
            print("retry save_screenshot()", e)
        time.sleep(RETRY_WAIT_SEC)
    return False


def set_window_size(browser, max_retry):
    """Resize the window to the scroll size of ``document.body``.

    Args:
        browser: WebDriver instance to resize.
        max_retry: Maximum number of attempts.

    Returns:
        True once an attempt succeeds, False when every attempt failed.
    """
    for _ in range(max_retry):
        try:
            size_w = browser.execute_script(
                "return document.body.scrollWidth;")
            size_h = browser.execute_script(
                "return document.body.scrollHeight;")
            browser.set_window_size(size_w, size_h)
            return True
        except Exception as e:
            print("retry set_window_size()", e)
            time.sleep(RETRY_WAIT_SEC)
    return False


def http_get_url(browser, req_url, max_retry):
    """Load *req_url* in the browser.

    Args:
        browser: WebDriver instance used for the request.
        req_url: URL to open.
        max_retry: Maximum number of attempts.

    Returns:
        True once an attempt succeeds, False when every attempt failed.
    """
    for _ in range(max_retry):
        try:
            browser.get(req_url)
            return True
        except Exception as e:
            print("http_get_url()", req_url, e)
            time.sleep(RETRY_WAIT_SEC)
    return False


def init_browser():
    """Start a headless Firefox driven by the configured geckodriver.

    Returns:
        The WebDriver instance, with a 30 second implicit wait set.
    """
    profile = webdriver.FirefoxProfile()
    options = Options()
    options.headless = True
    browser = webdriver.Firefox(executable_path=conf["geckodriver_path"],
                                options=options,
                                firefox_profile=profile,
                                service_log_path=os.path.devnull)
    browser.implicitly_wait(30)
    return browser


def load_html_paths(html_paths_file):
    """Read the request paths, one per line, from *html_paths_file*.

    Surrounding whitespace is stripped and blank lines are dropped, since an
    empty path would only request the site root and produce a file name
    without a stem.

    Returns:
        List of path strings in file order.
    """
    with open(html_paths_file, mode='r') as f:
        stripped = (line.strip() for line in f)
        return [path for path in stripped if path]


if __name__ == '__main__':
    main()