上記entryの続きです。先程の Python script で取得した検索結果一覧の URL にアクセスし、そこに掲載されている物件の詳細を取得します。
#!python3
# -*- coding: utf-8 -*-
"""Fetch property details from SUUMO search-result pages.

Usage: pass the path of a UTF-8 TSV file as the first argument.  Each line
must contain three tab-separated columns: ``base_url``, ``pref`` and
``result_url``.  Every ``result_url`` is opened in headless Chrome and each
property cassette found on the page is printed to stdout as one
tab-separated row whose columns follow ``disp_keys``.  Progress (the input
line number) is written to stderr.
"""
# http://chromedriver.chromium.org/getting-started
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
import os
import re
import sys
import time

chrome_conf = {
    # chromedriver.exe is looked up in the current working directory.
    "chrome_driver": os.getcwd() + '\\chromedriver.exe',
    "chrome_options": [
        "--headless",
        "--enable-logging=False",
        # The options below are reportedly workarounds for SSL errors.
        "--ignore-certificate-errors",
        "--disable-extensions",
        "--ignore-ssl-errors",
        "--disable-print-preview"],
    # Seconds passed to browser.implicitly_wait().
    "implicitly_wait": 10
}

# Output columns, in order.  Apart from 'base_url' these are the <dt> labels
# read from each property cassette; they are runtime keys, not comments.
disp_keys = [
    'base_url', '物件名', '販売価格', '所在地', '沿線・駅',
    '間取り', '建物面積', '専有面積', '土地面積'
]

# Category whose result pages are skipped entirely (see main()).
SKIPPED_BASE_URL = "https://suumo.jp/ms/shinchiku/"

# The browser is replaced with a fresh instance every this many input lines.
# NOTE(review): presumably to keep a long headless session from degrading.
BROWSER_RESTART_INTERVAL = 20


def main():
    """Read the URL list given as argv[1] and print details of every property.

    Blank lines are ignored; lines that do not have exactly three
    tab-separated columns are reported on stderr and skipped.  The input file
    and the browser are released even when an error interrupts the run.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: {} URLS_LIST_TSV".format(sys.argv[0]))
    urls_list_tsv = sys.argv[1]

    with open(urls_list_tsv, 'r', encoding='UTF-8') as fh:
        browser = init_browser()
        try:
            for i, tsv_line in enumerate(fh, start=1):
                print(i, file=sys.stderr)

                tsv_line = tsv_line.strip()
                if not tsv_line:
                    continue

                cols = tsv_line.split("\t")
                if len(cols) != 3:
                    print("line {}: expected 3 columns, got {}; skipped"
                          .format(i, len(cols)), file=sys.stderr)
                    continue
                base_url, pref, result_url = cols

                # New-build condominiums often have no listed price etc.,
                # so they are ignored.
                if base_url == SKIPPED_BASE_URL:
                    continue

                bukken_infos = get_bukken_detail(
                    browser, base_url, pref, result_url)
                for bukken_info in bukken_infos:
                    disp_bukken_info(bukken_info)

                if i % BROWSER_RESTART_INTERVAL == 0:
                    # quit() (not close()) also terminates the chromedriver
                    # process, so restarts do not pile up orphaned drivers.
                    browser.quit()
                    browser = init_browser()
        finally:
            browser.quit()


def disp_bukken_info(bukken_info):
    """Print one property as a tab-separated row ordered by ``disp_keys``.

    Keys missing from ``bukken_info`` are printed as empty columns.
    """
    print("\t".join(bukken_info.get(disp_key, "") for disp_key in disp_keys))


def get_bukken_detail(browser, base_url, pref, result_url):
    """Open ``result_url`` and collect the label/value pairs of each property.

    Args:
        browser: Selenium WebDriver used to load the page.
        base_url: Stored in every returned dict under the key 'base_url'.
        pref: Not used by this function; kept so callers need not change.
        result_url: URL of the search-result page to load.

    Returns:
        A list with one dict per ``div.dottable.dottable--cassette`` element.
        Each dict maps the text of the first <dt> of every <dl> inside that
        element to the text of the first <dd> of the same <dl>.
    """
    browser.get(result_url)

    bukken_divs = browser.find_elements_by_css_selector(
        "div.dottable.dottable--cassette")

    ret_bukken_infos = []
    for bukken_div in bukken_divs:
        bukken_info = {'base_url': base_url}

        for dl in bukken_div.find_elements_by_css_selector("dl"):
            dts = dl.find_elements_by_css_selector("dt")
            dds = dl.find_elements_by_css_selector("dd")
            # A <dl> lacking either a label or a value carries no usable pair.
            if not dts or not dds:
                continue
            bukken_info[dts[0].text] = dds[0].text

        ret_bukken_infos.append(bukken_info)

    return ret_bukken_infos


def init_browser():
    """Start Chrome configured from ``chrome_conf`` and return the driver."""
    chopt = webdriver.ChromeOptions()
    for option_tmp in chrome_conf["chrome_options"]:
        chopt.add_argument(option_tmp)

    browser = webdriver.Chrome(
        options=chopt,
        executable_path=chrome_conf["chrome_driver"])

    # Wait up to the configured number of seconds for elements to be found.
    browser.implicitly_wait(chrome_conf["implicitly_wait"])
    return browser


if __name__ == '__main__':
    main()