end0tknr's kipple - web写経開発

太宰府天満宮の狛犬って、妙にカワイイ

selenium for python3 による google画像検索結果からの画像url抽出

以下の通りかと思います

#!/usr/local/bin/python3
# -*- coding: utf-8 -*-

from selenium import webdriver # ex. pip install selenium==4.0.0a7
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by      import By
from selenium.webdriver.support.ui     import Select
import re
import sys
import time
import urllib.parse

# Settings read by get_browser() when it builds the Selenium driver.
conf = dict(
    selenium=dict(
        # Path to the driver executable handed to the Selenium Service.
        # Alternative path, currently disabled: "/usr/local/bin/chromedriver"
        browser_driver="./chromedriver",
        # Command-line switches, each passed to Options.add_argument().
        # Optional switch, currently disabled: "--headless"
        browser_options=[
            "--enable-logging=False",
            "--ignore-certificate-errors",
            "--disable-extensions",
            "--disable-print-preview",
            "--download.default_directory=/tmp",
        ],
        # Value passed to browser.implicitly_wait().
        implicitly_wait=10,
    ),
)

# Endpoint and query string pieces used by main() to build the search URL.
search_url = "https://www.google.co.jp/search"
# Percent-encoded search phrase (spaces become %20).
search_str = urllib.parse.quote("戸建て住宅 間取り図面")
# Joined with "&" in main().
# NOTE(review): "ei" looks like a token copied from a browser session;
# confirm whether the search still needs it.
search_params = [
    "q=" + search_str,
    "tbm=isch",
    "hl=ja",
    "sclient=img",
    "ei=7yL2Ypn2MtuM1e8PutqksA0",
    "tbs=itp:clipart",
    "sa=X",
]
# Captures the still percent-encoded value of the "imgurl" query parameter
# from an anchor href. Written as a raw string so that the regex escape "\?"
# is not parsed as an (invalid) Python string escape.
re_compile = re.compile(r"\?imgurl=(.[^&]+)")
# Maximum number of URLs to collect.
# Google search returned roughly this many results at most.
max_url_size = 900


def main():
    """Run the image search and print every collected image URL.

    Opens the search page built from ``search_url`` and ``search_params``,
    then calls ``extract_img_urls`` repeatedly until ``max_url_size`` URLs
    have been collected or several passes in a row add nothing new.
    The request URL and the URLs go to stdout, progress counts to stderr.
    The browser is always closed, even if scraping raises.
    """
    # Give up after this many consecutive passes without a new URL, so the
    # loop ends when the search has fewer results than max_url_size.
    max_stalled_passes = 3

    browser = get_browser()
    try:
        req_url = search_url + "?" + "&".join(search_params)
        print(req_url)
        browser.get(req_url)
        time.sleep(3)  # let the result page settle before scraping

        img_urls = []
        stalled_passes = 0
        while (len(img_urls) < max_url_size
               and stalled_passes < max_stalled_passes):
            print(len(img_urls), file=sys.stderr)
            count_before = len(img_urls)
            img_urls = extract_img_urls(browser, img_urls)
            if len(img_urls) > count_before:
                stalled_passes = 0
            else:
                stalled_passes += 1
    finally:
        # Shut down the browser and its driver process.
        browser.quit()

    for img_url in img_urls:
        print(img_url)

def extract_img_urls( browser, img_urls ):
    """Click result anchors and append the image URLs found in their hrefs.

    Args:
        browser: Selenium driver showing the image-search result page.
        img_urls: URLs collected so far; extended in place.

    Returns:
        The same ``img_urls`` list, with newly found URLs appended.

    Scanning starts at index ``len(img_urls)`` of the anchor list. Anchors
    that were skipped earlier add nothing to ``img_urls``, so some anchors
    can be visited again on a later call; URLs already collected are
    therefore not appended a second time.
    """
    a_elms = browser.find_elements(by=By.CSS_SELECTOR,
                                   value="a.wXeWr")
    seen = set(img_urls)

    for a_elm in a_elms[len(img_urls):]:
        try:
            # Clicking the anchor triggers auto-pagerize (more results load).
            a_elm.click()
            href_url = a_elm.get_attribute("href")
        except Exception as err:
            # Best effort: report the failure and move on to the next anchor.
            print("skipped anchor:", type(err).__name__, file=sys.stderr)
            time.sleep(1)
            continue

        # get_attribute() may return None when the anchor has no href.
        re_result = re_compile.search(href_url) if href_url else None
        if not re_result:
            continue

        img_url = urllib.parse.unquote(re_result.group(1))
        if img_url not in seen:
            seen.add(img_url)
            img_urls.append(img_url)

        time.sleep(1)

    return img_urls

# Returns the browser (driver) used when scraping with Selenium.
def get_browser():
    """Create and configure the Chrome driver described by ``conf["selenium"]``.

    Returns:
        A Chrome WebDriver with the configured options and implicit wait
        applied, and with downloads to ``/tmp`` allowed.
    """
    selenium_conf = conf["selenium"]
    browser_service = \
        Service( executable_path=selenium_conf["browser_driver"] )

    browser_opts = Options()
    for tmp_opt in selenium_conf["browser_options"]:
        browser_opts.add_argument( tmp_opt )

    # Service and Options above are the Chrome classes and the configured
    # driver is chromedriver, so the matching driver class is Chrome.
    browser = webdriver.Chrome(service = browser_service,
                               options = browser_opts )
    # Wait up to the configured number of seconds for elements to appear.
    browser.implicitly_wait( selenium_conf["implicitly_wait"] )

    # The following allows downloads even in headless mode.
    # refer to https://qiita.com/memakura/items/f80d2e2c59514cfc14c9
    browser.command_executor._commands["send_command"] = (
        "POST",
        '/session/$sessionId/chromium/send_command' )
    params = {'cmd': 'Page.setDownloadBehavior',
              'params': {'behavior': 'allow',
                         'downloadPath': '/tmp' } }
    browser.execute("send_command", params=params)

    return browser



# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()