end0tknr's kipple - web写経開発

太宰府天満宮の狛犬って、妙にカワイイ

selenium for python + chromedriver で、page全体の画面キャプチャを連続取得

【Python】Seleniumでページ全体のスクリーンショット撮るならマルチプロセスで! - Qiita

上記に倣い、以下のように書くと、よさそう。

ポイントは、

  • 一旦、対象のurlへアクセスし、page全体のサイズ ( document.body の scrollWidth / scrollHeight ) を取得
  • その後、別processの webdriverを、取得したサイズを --window-size に指定して起動し、画面キャプチャ取得

ちなみに、以下は、python for win で動作させています

#!/usr/local/bin/python
# -*- coding: utf-8 -*-

import getopt
import os
import re
import sys
# http://chromedriver.chromium.org/getting-started
from selenium import webdriver
import time
import urllib
import urllib.request
import urllib.parse


# Settings shared by every browser instance. Paths are built relative to the
# current working directory using Windows-style separators.
_WORK_DIR = os.getcwd()

conf = {
    "img_save_dir": _WORK_DIR + "\\IMG_TMP\\",
    "chrome_driver": _WORK_DIR + '\\chromedriver.exe',
    "chrome_options": [
        "--headless",
        "--enable-logging=False",
        # The next three flags are reportedly a workaround for SSL errors.
        "--ignore-certificate-errors",
        "--disable-extensions",
        "--disable-print-preview",
    ],
}

def main():
    """Capture a full-page screenshot for every path listed in a file.

    Command-line arguments (read from ``sys.argv``):
        1. PROTOCOL_and_FQDN -- prefix prepended to every path to form the URL.
        2. HTML_PATHS_FILE   -- text file with one path per line.

    For each path the page is first opened in a long-lived browser to
    measure ``document.body.scrollWidth`` / ``scrollHeight``. A second,
    short-lived browser is then started with that size as its window size
    and used to save the screenshot under ``conf["img_save_dir"]``.

    Prints a usage line and returns None when the argument count is wrong.
    """
    if len(sys.argv) != 3:
        print("USAGE:",sys.argv[0],"PROTOCOL_and_FQDN HTML_PATHS_FILE")
        return None

    req_domain = sys.argv[1]
    req_paths_file = sys.argv[2]

    # Read the path list before starting a browser, so that an unreadable
    # file does not leave a browser process behind.
    req_paths = load_html_paths(req_paths_file)

    # save_screenshot() only returns False when it cannot write the file,
    # so make sure the destination directory exists up front.
    # dirname() drops the trailing separator of the configured directory.
    save_dir = os.path.dirname(conf["img_save_dir"])
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    browser = init_browser()
    try:
        for i, req_path in enumerate(req_paths, start=1):
            req_url = req_domain + req_path
            print(i, req_url)

            # Comment this line in or out as needed.
            req_url = req_url.replace('.cgi.html', '.cgi')

            # Open the page once to measure the size of the whole document.
            browser.get(req_url)
            size_w = browser.execute_script("return document.body.scrollWidth;")
            size_h = browser.execute_script("return document.body.scrollHeight;")

            # Derive the image file name from the path:
            # drop the leading "/" and turn the remaining "/" into "_".
            req_path_tmp = re.sub('^/', '', req_path)
            req_path_tmp = re.sub('/', '_', req_path_tmp)
            img_save_path = conf["img_save_dir"] + req_path_tmp + ".png"

            # Take the screenshot in a separate browser whose window is as
            # large as the measured document.
            browser_tmp = init_browser_for_screenshot(size_w, size_h)
            try:
                browser_tmp.get(req_url)
                if not browser_tmp.save_screenshot(img_save_path):
                    print("FAILED to save screenshot:", img_save_path,
                          file=sys.stderr)
            finally:
                # quit() ends the whole driver session; close() would only
                # close the window.
                browser_tmp.quit()

            # For a quick trial run, stop early here, e.g.:
            #     if i > 5: break
    finally:
        browser.quit()


def init_browser():
    """Start a Chrome webdriver configured from ``conf``.

    Each entry of ``conf["chrome_options"]`` is added as a Chrome argument,
    and ``conf["chrome_driver"]`` is used as the driver executable path.

    Returns the newly created ``webdriver.Chrome`` instance.
    """
    chrome_opts = webdriver.ChromeOptions()
    for flag in conf["chrome_options"]:
        chrome_opts.add_argument(flag)

    driver_path = conf["chrome_driver"]
    return webdriver.Chrome(options=chrome_opts,
                            executable_path=driver_path)

def init_browser_for_screenshot(size_w,size_h):
    """Start a Chrome webdriver whose window has the given size.

    Uses the same arguments as ``conf["chrome_options"]`` plus a
    ``--window-size=<size_w>,<size_h>`` argument, and
    ``conf["chrome_driver"]`` as the driver executable path.

    Returns the newly created ``webdriver.Chrome`` instance.
    """
    window_size = ",".join([str(size_w), str(size_h)])
    flags = list(conf["chrome_options"])
    flags.append("--window-size=" + window_size)

    chrome_opts = webdriver.ChromeOptions()
    for flag in flags:
        chrome_opts.add_argument(flag)

    driver_path = conf["chrome_driver"]
    return webdriver.Chrome(options=chrome_opts,
                            executable_path=driver_path)


def load_html_paths( html_paths_file ):
    """Read request paths from a text file, one path per line.

    Args:
        html_paths_file: Path of a UTF-8 text file.

    Returns:
        A list with every non-blank line of the file, stripped of leading
        and trailing whitespace, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    # "utf-8-sig" reads plain UTF-8 unchanged and additionally drops a
    # leading byte-order mark, which would otherwise end up in the first path.
    with open(html_paths_file, mode='r', encoding='utf-8-sig') as f:
        stripped_lines = (line.strip() for line in f)
        # Skip blank lines: an empty path would request the bare domain and
        # be saved under the file name ".png".
        return [path for path in stripped_lines if path]

    

# Run the capture job only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()