end0tknr's kipple - web写経開発

太宰府天満宮の狛犬って、妙にカワイイ

BeautifulSoup for python で suumo の 物件詳細取得

selenium + chromedriver.exe + python で suumo の 物件詳細取得 - end0tknr's kipple - web写経開発

先程、上記entryを記載しましたが、 selenium の場合、動作があまりに遅い...

今回は画面に対する click 等の操作が不要なので、改めて BeautifulSoup for python で書き直しました。

#!python3
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup
import requests
import csv
import re
import sys
import time

# Columns written for every listing, in output order.  'base_url' is filled
# in by get_bukken_detail(); the remaining keys are <dt> labels scraped from
# the page, so they must stay exactly as they appear there.
disp_keys = [
    'base_url',
    '物件名',
    '販売価格',
    '所在地',
    '沿線・駅',
    '間取り',
    '建物面積',
    '専有面積',
    '土地面積',
]

# Settings read by get_http_requests().
conf = dict(
    retry_limit=10,  # maximum number of requests.get() attempts per URL
    retry_sleep=10,  # value passed to time.sleep() between attempts
)


def main():
    """Read a TSV of search-result URLs and print the listings on each page.

    The TSV path is taken from ``sys.argv[1]``.  Each line is expected to hold
    three tab-separated columns: ``base_url``, ``pref`` and ``result_url``.
    Progress goes to stderr (a dot per input line, the running line count
    every 50 lines); the extracted listings are printed by
    ``disp_bukken_info``.

    Blank lines are skipped silently and lines that do not have exactly three
    columns are skipped with a warning, so one bad line no longer aborts the
    whole run.
    """
    if len(sys.argv) < 2:
        # sys.exit() with a string prints it to stderr and exits with status 1.
        sys.exit("usage: " + sys.argv[0] + " URLS_LIST_TSV")

    urls_list_tsv = sys.argv[1]

    # 'with' guarantees the file is closed even if a page fails to parse.
    with open(urls_list_tsv, 'r', encoding='UTF-8') as fh:
        for line_no, tsv_line in enumerate(fh, start=1):
            if line_no % 50 == 0:
                print(line_no, file=sys.stderr, flush=True)
            else:
                print(".", end="", file=sys.stderr, flush=True)

            tsv_line = tsv_line.strip()
            if not tsv_line:
                continue

            cols = tsv_line.split("\t")
            if len(cols) != 3:
                print("WARN skip malformed tsv line", line_no,
                      file=sys.stderr)
                continue
            base_url, pref, result_url = cols

            # Newly built condominiums often have no price etc. listed,
            # so they are ignored.
            if base_url == "https://suumo.jp/ms/shinchiku/":
                continue

            bukken_infos = get_bukken_detail(base_url, pref, result_url)

            for bukken_info in bukken_infos:
                disp_bukken_info(bukken_info)


def get_http_requests(result_url):
    """GET ``result_url``, retrying on request errors.

    Up to ``conf["retry_limit"]`` attempts are made, sleeping
    ``conf["retry_sleep"]`` seconds between them.  Each attempt is bounded by
    a timeout (``conf["request_timeout"]`` if that key is present, otherwise
    30 seconds) so a stalled connection cannot hang the script forever.

    Only ``requests.exceptions.RequestException`` is retried; anything else
    (including ``KeyboardInterrupt``) propagates to the caller.

    Returns:
        The response object from ``requests.get``, or ``None`` when every
        attempt raised.  The HTTP status is not inspected here.
    """
    retry_limit = conf["retry_limit"]
    timeout = conf.get("request_timeout", 30)

    for attempt in range(1, retry_limit + 1):
        try:
            return requests.get(result_url, timeout=timeout)
        except requests.exceptions.RequestException:
            # No point in announcing a retry or sleeping after the last try.
            if attempt < retry_limit:
                print("WARN retry requests.get()", result_url,
                      file=sys.stderr)
                time.sleep(conf["retry_sleep"])

    print("ERROR requests.get()", result_url, file=sys.stderr)
    return None
        

def get_bukken_detail(base_url, pref, result_url):
    """Fetch ``result_url`` and return one dict per listing found on the page.

    Each ``div.dottable.dottable--cassette`` element becomes a dict that
    starts with ``{'base_url': base_url}`` and gains one entry per ``<dl>``
    inside it: the stripped text of the first ``<dt>`` is the key and the
    stripped text of the first ``<dd>`` is the value.  A ``<dl>`` lacking
    either element is ignored.

    ``pref`` is accepted but not used.  An empty list is returned when
    ``get_http_requests`` yields a falsy result.
    """
    response = get_http_requests(result_url)
    if not response:
        return []

    page = BeautifulSoup(response.content, 'html.parser')

    listings = []
    for cassette in page.select("div.dottable.dottable--cassette"):
        info = {'base_url': base_url}
        for dl in cassette.select("dl"):
            labels = dl.select("dt")
            values = dl.select("dd")
            # Only record the pair when both a label and a value exist.
            if labels and values:
                info[labels[0].text.strip()] = values[0].text.strip()
        listings.append(info)

    return listings


def disp_bukken_info(bukken_info):
    """Print one listing as a tab-separated line on stdout.

    Columns follow the order of ``disp_keys``; a key missing from
    ``bukken_info`` produces an empty column.
    """
    columns = [bukken_info.get(key, "") for key in disp_keys]
    line = "\t".join(columns)

    # Round-trip through cp932, dropping any character that encoding cannot
    # represent, before printing.
    # refer to https://qiita.com/butada/items/33db39ced989c2ebf644
    printable = line.encode('cp932', "ignore").decode('cp932')
    print(printable)
    

# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()