end0tknr's kipple - web写経開発

太宰府天満宮の狛犬って、妙にカワイイ

selenium for pythonで 国交省の宅建業者等企業情報検索システムから 宅地建物取引業者をparse

全国の宅建業者数が、かなり多い為、時間を要しますが、以下の通りかと思います。

#!python3
# -*- coding: utf-8 -*-

import getopt
import os
import sys
import re
import requests
import time
import urllib.parse

# http://chromedriver.chromium.org/getting-started
from selenium import webdriver # ex. pip install selenium==4.1.3
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select

browser_conf = {
    "browser_driver": os.getcwd() + '\\chromedriver.exe',
    "browser_options" : [
        # "headless",
        "enable-logging=False",
        #以下はSSLエラー対策
        "ignore-certificate-errors",
        "disable-extensions",
        "ignore-ssl-errors",
        "disable-print-preview"],
    "implicitly_wait": 10 }


def main():

    url_base = "https://etsuran.mlit.go.jp/TAKKEN/takkenKensaku.do"
    # pref_no = 1
    pref_no = 13
    max_pref_no = 47
    # max_pref_no = 2

    while pref_no <= max_pref_no:
        req_url = url_base + "?dispCount=50&kenCode=%02d" %(pref_no)
        print( "%s" % (req_url), file=sys.stderr )

        browser = init_browser()
        browser.get(req_url)

        search_btn = find_search_btn( browser )
        search_btn.click()
        
        shops_hash = parse_found_shops_pages(browser)

        for pref_licence,shop in shops_hash.items():
            print( pref_licence +"\t"+ shop )

        browser.close()
        pref_no += 1


def parse_found_shops_pages(browser):
    shops_hash = {}
    i = 0
    while(i < 50000 ):
        shops_hash_tmp = parse_shops( browser )
        shops_hash.update( shops_hash_tmp )

        select_elms = browser.find_elements(by=By.CSS_SELECTOR, value="#pageListNo1")
        page_no = Select(select_elms[0]).first_selected_option.text.split("/")

        if i % 10 == 0:
            print( "%s / %s" % (page_no[0],page_no[1]), file=sys.stderr )

        if page_no[0] == page_no[1]:
            break

        next_btn = find_next_btn( browser )
        next_btn.click()
        time.sleep(2)
        i += 1
        
    return shops_hash


def parse_shops( browser ):
    tr_elms = browser.find_elements(by=By.CSS_SELECTOR,value="table.re_disp tr")
    tr_elms.pop(0) # 先頭行はヘッダの為、削除

    re_compile = re.compile("[\((].+[\))]")
    
    shops_tmp = {}
    for tr_elm in tr_elms:
        cols_str = tr_elm.text
        cols = tr_elm.text.split(" ")

        if len(cols) < 3:
            continue

        government = re_compile.sub('',cols[1])
        licence    = re_compile.sub('',cols[2])
        
        replace_strs = ["株式会社","有限会社","合資会社","合同会社",
                        "一般財団法人","公益財団法人"]
        shop = cols[3]
        for replace_str in replace_strs:
            shop = shop.replace(replace_str,"")
            
        shop = shop.strip().strip(" ")
        
        print(government, licence, shop)
        shop_key = government +"\t"+ licence
        shops_tmp[shop_key] = shop

    return shops_tmp
    
def find_search_btn( browser ):
    img_elms = browser.find_elements(by=By.CSS_SELECTOR, value="img")
    for img_elm in img_elms:
        img_src = img_elm.get_attribute("src")
        if img_src == "https://etsuran.mlit.go.jp/TAKKEN/images/btn_search_off.png":
            return img_elm
    return None


def find_next_btn( browser ):
    img_elms = browser.find_elements(by=By.CSS_SELECTOR, value="img")
    for img_elm in img_elms:
        img_src = img_elm.get_attribute("src")
        if img_src == "https://etsuran.mlit.go.jp/TAKKEN/images/result_move_r.jpg":
            return img_elm
    return None


def init_browser():
    browser_service = Service(
        executable_path=browser_conf["browser_driver"] )
    
    browser_opts = Options()
    for tmp_opt in browser_conf["browser_options"]:
        browser_opts.add_argument( tmp_opt )

    browser = webdriver.Chrome(service = browser_service,
                               options = browser_opts )
    # 要素が見つかるまで、最大 ?秒 待つ
    browser.implicitly_wait( browser_conf["implicitly_wait"] )
    return browser

if __name__ == '__main__':
    main()