selenium + chromedriver.exe + python で suumo の 物件詳細取得 - end0tknr's kipple - web写経開発
先程、上記entryを記載しましたが、 selenium の場合、動作があまりに遅い...
今回は画面に対する click 等の操作が不要ですので、改めて Python の BeautifulSoup を使って書き直しました。
#!python3
# -*- coding: utf-8 -*-
"""Fetch SUUMO search-result pages and print property details as TSV.

Usage: pass the path of a UTF-8 TSV file as the first command-line argument.
Each line of that file must hold three tab-separated fields:
``base_url``, ``pref`` and ``result_url``.  Every ``result_url`` is fetched,
the property "cassettes" in the page are parsed, and one tab-separated row
per property is written to stdout.  Progress and warnings go to stderr.
"""
import csv
import re
import sys
import time

from bs4 import BeautifulSoup
import requests

# Columns written for every property, in output order.  Apart from
# 'base_url' these are the <dt> labels found on the page (property name,
# sale price, location, line/station, floor plan, building area,
# exclusive area, land area).
disp_keys = [
    'base_url', '物件名', '販売価格', '所在地', '沿線・駅',
    '間取り', '建物面積', '専有面積', '土地面積',
]

conf = {
    "retry_limit": 10,      # maximum number of GET attempts per URL
    "retry_sleep": 10,      # seconds to wait between attempts
    "request_timeout": 30,  # seconds before a stalled request is abandoned
}


def main():
    """Read the URL list given in ``sys.argv[1]`` and print every property.

    Exits with status 1 and a usage message when no file argument is given.
    """
    if len(sys.argv) < 2:
        print("usage:", sys.argv[0], "URLS_LIST_TSV", file=sys.stderr)
        sys.exit(1)

    urls_list_tsv = sys.argv[1]

    # ``with`` guarantees the file is closed even if a request or the
    # parser raises part-way through the list.
    with open(urls_list_tsv, 'r', encoding='UTF-8') as fh:
        for i, tsv_line in enumerate(fh, start=1):
            # Progress indicator: a dot per line, the line count every 50.
            if i % 50 == 0:
                print(i, file=sys.stderr, flush=True)
            else:
                print(".", end="", file=sys.stderr, flush=True)

            tsv_line = tsv_line.strip()
            if not tsv_line:
                # Blank lines (e.g. a trailing newline) carry no URL.
                continue

            fields = tsv_line.split("\t")
            if len(fields) != 3:
                print("WARN skip malformed line", i, file=sys.stderr)
                continue
            base_url, pref, result_url = fields

            # New-build condominium listings usually omit the price and
            # similar fields, so skip them.
            if base_url == "https://suumo.jp/ms/shinchiku/":
                continue

            bukken_infos = get_bukken_detail(base_url, pref, result_url)
            for bukken_info in bukken_infos:
                disp_bukken_info(bukken_info)


def get_http_requests(result_url):
    """GET ``result_url``, retrying on network errors.

    Up to ``conf["retry_limit"]`` attempts are made, sleeping
    ``conf["retry_sleep"]`` seconds between them.  Each attempt is bounded
    by ``conf["request_timeout"]`` seconds.

    Returns the ``requests.Response`` of the first attempt that does not
    raise (its HTTP status is NOT checked here), or ``None`` when every
    attempt failed.
    """
    retry_limit = conf["retry_limit"]
    for attempt in range(1, retry_limit + 1):
        try:
            return requests.get(result_url, timeout=conf["request_timeout"])
        except requests.exceptions.RequestException as err:
            # Only request-level failures are retried; KeyboardInterrupt
            # and programming errors propagate instead of being swallowed.
            print("WARN retry requests.get()", result_url, err,
                  file=sys.stderr)
        if attempt < retry_limit:
            # No point in sleeping after the final attempt.
            time.sleep(conf["retry_sleep"])

    print("ERROR requests.get()", result_url, file=sys.stderr)
    return None


def get_bukken_detail(base_url, pref, result_url):
    """Fetch ``result_url`` and extract the properties listed on the page.

    Args:
        base_url: Stored in every returned dict under the key 'base_url'.
        pref: Unused here; kept so the call signature matches the TSV
            columns.
        result_url: URL of the page to download and parse.

    Returns:
        A list with one dict per ``div.dottable.dottable--cassette`` element.
        Each dict maps the stripped text of the first <dt> of every <dl> to
        the stripped text of its first <dd>, plus 'base_url'.  The list is
        empty when the download failed or returned a non-success status.
    """
    result = get_http_requests(result_url)
    if result is None:
        return []
    if not result.ok:
        # Same outcome as the implicit truthiness test of a Response, but
        # the skipped page is now reported.
        print("WARN http status", result.status_code, result_url,
              file=sys.stderr)
        return []

    soup = BeautifulSoup(result.content, 'html.parser')

    ret_bukken_infos = []
    for bukken_div in soup.select("div.dottable.dottable--cassette"):
        bukken_info = {'base_url': base_url}
        for dl in bukken_div.select("dl"):
            dts = dl.select("dt")
            dds = dl.select("dd")
            if not dts or not dds:
                # A <dl> without both a label and a value has nothing to
                # record.
                continue
            bukken_info[dts[0].text.strip()] = dds[0].text.strip()
        ret_bukken_infos.append(bukken_info)

    return ret_bukken_infos


def disp_bukken_info(bukken_info):
    """Print one property as a tab-separated row in ``disp_keys`` order.

    Keys missing from ``bukken_info`` produce an empty column.
    """
    # A tab or newline inside a value would break the one-row-per-property
    # TSV layout, so replace any such run with a single space.
    disp_cols = [
        re.sub(r"[\t\r\n]+", " ", bukken_info.get(disp_key, ""))
        for disp_key in disp_keys
    ]
    disp_cols_str = "\t".join(disp_cols)

    # Round-trip through cp932 with errors ignored: this drops every
    # character cp932 cannot represent, so print() does not fail on a
    # cp932 console.
    # refer to https://qiita.com/butada/items/33db39ced989c2ebf644
    disp_cols_str = disp_cols_str.encode('cp932', "ignore")
    disp_cols_str = disp_cols_str.decode('cp932')
    print(disp_cols_str)


if __name__ == '__main__':
    main()