end0tknr's kipple - web写経開発

太宰府天満宮の狛犬って、妙にカワイイ

selenium + chromedriver.exe + python で suumo の 物件詳細取得

上記entryの続きです。先程の python script で取得した検索結果一覧の url を順に開き、各ページに記載されている物件詳細を取得します。

#!python3
# -*- coding: utf-8 -*-

import os
import re
import sys
import time

# http://chromedriver.chromium.org/getting-started
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait

# Settings used to start headless Chrome through chromedriver.
chrome_conf = {
    # chromedriver.exe is expected in the current working directory.
    # os.path.join() is used instead of concatenating a hard-coded '\\'.
    "chrome_driver": os.path.join(os.getcwd(), "chromedriver.exe"),
    "chrome_options": [
        "--headless",
        "--enable-logging=False",
        # The options below are reportedly workarounds for SSL errors.
        "--ignore-certificate-errors",
        "--disable-extensions",
        "--ignore-ssl-errors",
        "--disable-print-preview",
    ],
    # Implicit-wait timeout, in seconds.
    "implicitly_wait": 10,
}

# Output columns, in the order they are written for each property.
# Apart from 'base_url', the names are the Japanese labels looked up in the
# scraped property dict.
disp_keys = [
    'base_url',
    '物件名',
    '販売価格',
    '所在地',
    '沿線・駅',
    '間取り',
    '建物面積',
    '専有面積',
    '土地面積',
]


def main():
    """Read a TSV of result-page URLs and print the properties found.

    The TSV path is taken from ``sys.argv[1]``.  Every non-blank line must
    hold three tab-separated columns: ``base_url``, ``pref`` and
    ``result_url``.  Each result page is loaded in the browser and every
    property found on it is printed to stdout as one tab-separated row.
    The 1-based line number is echoed to stderr as a progress indicator.

    Exits with a usage message when no TSV path is given.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: %s URLS_LIST_TSV" % sys.argv[0])
    urls_list_tsv = sys.argv[1]

    # Restart the browser after this many fetched pages
    # (presumably to keep headless Chrome's resource usage bounded).
    restart_interval = 20
    fetched = 0

    browser = init_browser()
    try:
        with open(urls_list_tsv, 'r', encoding='UTF-8') as fh:
            for i, tsv_line in enumerate(fh, start=1):
                print(i, file=sys.stderr)

                tsv_line = tsv_line.strip()
                if not tsv_line:
                    # Blank lines (e.g. a trailing newline) carry no URL.
                    continue

                cols = tsv_line.split("\t")
                if len(cols) != 3:
                    print("line %d: expected 3 columns, got %d; skipped"
                          % (i, len(cols)), file=sys.stderr)
                    continue
                base_url, pref, result_url = cols

                # New-build condominiums often list no price etc., so
                # they are ignored.
                if base_url == "https://suumo.jp/ms/shinchiku/":
                    continue

                bukken_infos = get_bukken_detail(
                    browser, base_url, pref, result_url)
                for bukken_info in bukken_infos:
                    disp_bukken_info(bukken_info)

                # Count fetched pages rather than input lines so that a
                # skipped line can never suppress the periodic restart.
                fetched += 1
                if fetched % restart_interval == 0:
                    # quit() rather than close(): close() only closes the
                    # window and leaves the chromedriver process running.
                    browser.quit()
                    browser = None
                    browser = init_browser()
    finally:
        # Always release the browser, even when a page fails to load.
        if browser is not None:
            browser.quit()

    

def disp_bukken_info(bukken_info):
    """Print one property as a single tab-separated row on stdout.

    Columns follow the order of the module-level ``disp_keys``.  A key that
    is absent from ``bukken_info`` produces an empty column.
    """
    row = [bukken_info.get(column, "") for column in disp_keys]
    print("\t".join(row))
    

def get_bukken_detail(browser, base_url, pref, result_url):
    """Collect the properties listed on one search-result page.

    Args:
        browser: Selenium WebDriver used to load ``result_url``.
        base_url: Stored unchanged in every result under ``'base_url'``.
        pref: Not used by this function; kept so the signature stays
            compatible with existing callers.
        result_url: URL of the result page to load.

    Returns:
        A list with one dict per ``div.dottable.dottable--cassette``
        element on the page.  Besides ``'base_url'``, each dict maps the
        text of the first ``<dt>`` of every ``<dl>`` inside that element to
        the text of the first ``<dd>`` of the same ``<dl>``.
    """
    browser.get(result_url)

    # find_elements(By.CSS_SELECTOR, ...) replaces the
    # find_elements_by_css_selector() helper, which was removed in
    # Selenium 4.3; this form is also available in Selenium 3.
    bukken_divs = browser.find_elements(
        By.CSS_SELECTOR, "div.dottable.dottable--cassette")

    ret_bukken_infos = []
    for bukken_div in bukken_divs:
        bukken_info = {'base_url': base_url}
        for dl in bukken_div.find_elements(By.CSS_SELECTOR, "dl"):
            dts = dl.find_elements(By.CSS_SELECTOR, "dt")
            dds = dl.find_elements(By.CSS_SELECTOR, "dd")
            # A <dl> lacking either a label or a value has nothing to record.
            if not dts or not dds:
                continue
            bukken_info[dts[0].text] = dds[0].text
        ret_bukken_infos.append(bukken_info)
    return ret_bukken_infos
            
    
def init_browser():
    """Start Chrome through chromedriver using the ``chrome_conf`` settings.

    Every entry of ``chrome_conf["chrome_options"]`` is passed to Chrome as
    a command-line argument, and the implicit wait is set to
    ``chrome_conf["implicitly_wait"]``.

    Returns:
        The newly created ``webdriver.Chrome`` instance.
    """
    options = webdriver.ChromeOptions()
    for argument in chrome_conf["chrome_options"]:
        options.add_argument(argument)

    # NOTE(review): `executable_path` appears to be deprecated in Selenium 4
    # in favour of a Service object -- confirm before upgrading Selenium.
    driver = webdriver.Chrome(
        executable_path=chrome_conf["chrome_driver"],
        options=options,
    )

    # Wait up to the configured number of seconds for elements to be found.
    driver.implicitly_wait(chrome_conf["implicitly_wait"])
    return driver

# Run the scraper only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()