end0tknr's kipple - web写経開発

太宰府天満宮の狛犬って、妙にカワイイ

selenium for python + firefoxdriver で、page全体の画面キャプチャを連続取得

selenium for python + chromedriver で、page全体の画面キャプチャを連続取得 - end0tknr's kipple - web写経開発

先程のエントリの別バージョン。

chrome driverを動作させてみると、画面キャプチャ10枚程度で動作が止まってしまう...

原因は不明ですが、firefoxdriver (geckodriver) 版を書いてみたところ、上手く動作するようです。

しかも、先程のように画面キャプチャ用browserの別プロセス化も不要。

こちらの方がよさそうです。

#!python
# -*- coding: utf-8 -*-

import getopt
import os
import pathvalidate
import re
import sys
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.firefox.options import Options
import time


# Runtime settings shared by the functions below.
#   geckodriver_path: geckodriver executable, located relative to the
#                     current working directory (Windows-style path).
#   img_save_dir    : directory the PNG captures are written to
#                     (the current working directory).
conf = {
    "geckodriver_path": os.getcwd() + "\\..\\selenium\\geckodriver.exe",
    "img_save_dir": os.getcwd(),
}

def main():
    """Capture a full-page screenshot for every path listed in a file.

    Reads two command-line arguments: the protocol-and-host prefix that
    every request URL starts with, and the name of a text file holding
    the paths to visit. Each page is loaded in a headless browser, the
    window is resized to the page's scroll size, and a PNG is written
    into ``conf["img_save_dir"]``.

    Returns:
        None. When the argument count is wrong a usage line is printed
        and the function returns without starting a browser.

    Raises:
        SystemExit: with status 1 when a page cannot be loaded, resized
            or captured after the retries are exhausted.
    """
    if len(sys.argv) != 3:
        print("USAGE:",sys.argv[0],"PROTOCOL_and_FQDN HTML_PATHS_FILE")
        return None
    req_domain = sys.argv[1]
    req_paths_file = sys.argv[2]

    # Start the browser in headless mode.
    browser = init_browser()
    try:
        print( req_paths_file )

        req_paths = load_html_paths( req_paths_file )
        for i, req_path in enumerate(req_paths, start=1):
            req_url = req_domain + "/" +req_path
            print(i, req_url)

            # Shrinking a window that was enlarged earlier still seems to
            # produce a large image, so reset to a smallish size first.
            browser.set_window_size(980, 800)

            if not http_get_url(browser, req_url, 5):
                print("fail http_get_url()", req_url)
                sys.exit(1)

            # Skip pages that were redirected to a completely different
            # site. A literal prefix test is used on purpose: re.match()
            # signals a mismatch with None (never False) and would treat
            # the "." characters of the host name as wildcards.
            if not browser.current_url.startswith(req_domain):
                print("SKIP ! different host" + browser.current_url)
                continue

            # Resize the window so the whole page fits into the capture.
            if not set_window_size(browser, 5):
                print("error set_window_size()")
                sys.exit(1)

            # Work out the file name the capture is saved under.
            req_path_tmp = re.sub('/', '_', req_path)
            req_path_tmp = pathvalidate.sanitize_filename(req_path_tmp)

            img_save_path = conf["img_save_dir"] + "/" + req_path_tmp + ".png"
            print( " save at " + img_save_path )

            # Write the capture to disk.
            if not save_screenshot(browser, img_save_path, 5):
                print("error save_screenshot()")
                sys.exit(1)
    finally:
        # Always end the WebDriver session, including on the sys.exit()
        # paths above and on unexpected exceptions.
        browser.quit()


def save_screenshot(browser, img_save_path, max_retry):
    """Save a screenshot of the current page, retrying on failure.

    Args:
        browser: WebDriver-like object exposing ``save_screenshot(path)``.
        img_save_path: Destination file name of the PNG image.
        max_retry: Maximum number of attempts.

    Returns:
        True as soon as one attempt succeeds, False when every attempt
        failed (or ``max_retry`` is 0).
    """
    for _ in range(max_retry):
        try:
            saved = browser.save_screenshot(img_save_path)
        except Exception as e:
            # Best-effort retry, but show why the attempt failed.
            print("retry save_screenshot()", e)
            continue
        # WebDriver.save_screenshot() reports an I/O error by returning
        # False instead of raising, so the result has to be checked too.
        if saved is not False:
            return True
        print("retry save_screenshot()")
    return False

def set_window_size(browser, max_retry):
    """Resize the browser window to the scroll size of the loaded page.

    The page's ``document.body.scrollWidth`` and ``scrollHeight`` are
    read through JavaScript and applied as the new window size. When any
    step raises, a message is printed and the attempt is repeated after
    a two second pause.

    Args:
        browser: WebDriver-like object exposing ``execute_script`` and
            ``set_window_size``.
        max_retry: Maximum number of attempts.

    Returns:
        True once the window was resized, False when all attempts failed.
    """
    scripts = ("return document.body.scrollWidth;",
               "return document.body.scrollHeight;")

    for _attempt in range(max_retry):
        try:
            width, height = [browser.execute_script(js) for js in scripts]
            browser.set_window_size(width, height)
        except Exception:
            print("retry set_window_size()")
            time.sleep(2)
        else:
            return True
    return False
            

def http_get_url(browser, req_url, max_retry):
    """Load ``req_url`` in the browser, retrying when the load raises.

    After a failed attempt the URL is printed and the next attempt is
    made after a two second pause.

    Args:
        browser: WebDriver-like object exposing ``get(url)``.
        req_url: URL to open.
        max_retry: Maximum number of attempts.

    Returns:
        True once ``browser.get`` completed without raising, False when
        all attempts failed.
    """
    for _attempt in range(max_retry):
        try:
            browser.get(req_url)
        except Exception:
            print("http_get_url()", req_url)
            time.sleep(2)
            continue
        return True
    return False
    
    
def init_browser():
    """Start a headless Firefox WebDriver session.

    The geckodriver executable is taken from ``conf["geckodriver_path"]``
    and the driver's service log is sent to the null device.

    Returns:
        The ``webdriver.Firefox`` instance, with an implicit wait of 30
        configured.
    """
    # NOTE(review): the executable_path / firefox_profile /
    # service_log_path keywords and Options.headless look like the
    # Selenium 3 style API - confirm against the installed version.
    profile = webdriver.FirefoxProfile()
    options = Options()
    options.headless = True

    browser = webdriver.Firefox(executable_path=conf["geckodriver_path"],
                                options=options,
                                firefox_profile=profile,
                                service_log_path=os.path.devnull)
    # The WebDriverWait object that used to be created here was never
    # used, so only the implicit wait is configured.
    browser.implicitly_wait(30)
    return browser


def load_html_paths( html_paths_file ):
    """Read the list of request paths from a text file.

    Each line holds one path. Surrounding whitespace is stripped, and
    lines that are empty after stripping are skipped so that a trailing
    blank line does not turn into a request for an empty path.

    Args:
        html_paths_file: Name of the text file to read.

    Returns:
        A list of the non-empty, stripped lines in file order.

    Raises:
        OSError: when the file cannot be opened.
    """
    # The with-block closes the file even when reading raises.
    with open(html_paths_file, mode='r') as f:
        stripped_lines = (line.strip() for line in f)
        return [path for path in stripped_lines if path]

# Run the capture job only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()