end0tknr's kipple - web写経開発

The komainu (guardian lion-dogs) at Dazaifu Tenmangu are oddly cute.

Practicing scraping with BeautifulSoup for Python 3

機械学習を使って東京23区のお買い得賃貸物件を探してみた 〜スクレイピング編〜 - データで見る世界 (roughly: "I tried using machine learning to find bargain rental listings in Tokyo's 23 wards: the scraping part")

I transcribed the code from the article above, and ended up with the following.
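The only third-party dependencies are requests and beautifulsoup4 (e.g. pip install requests beautifulsoup4); everything else comes from the standard library.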

#!python3
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup
import requests
import csv
import re
import sys
import time

# First page of the search results
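# (The query string below appears to encode the search conditions:
#  ta=13 / sc=13121 look like the prefecture and municipality codes for
#  Tokyo and a single ward, while cb/ct, mb/mt, et and cn look like rent,
#  floor-area, walk-time and building-age limits left wide open.
#  These meanings are guessed from the URL itself, not taken from any
#  documented SUUMO API.)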
req_url_1st = \
    "http://suumo.jp/jj/chintai/ichiran/FR301FC001/?" + \
    "&".join(["ar=030",
              "bs=040",
              "ta=13",
              "sc=13121",
              "cb=0.0",
              "ct=9999999",
              "et=9999999",
              "cn=9999999",
              "mb=0",
              "mt=9999999",
              "shkr1=03",
              "shkr2=03",
              "shkr3=03",
              "shkr4=03",
              "fw2=",
              "srch_navi=1"])

def main():
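    # Overall flow: collect the URL of every search-result page,
    # scrape the listings from each page (pausing between requests),
    # then dump everything into bukkens.csv.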
    
    urls = get_req_urls()

    bukkens = []
    for i, url in enumerate(urls, 1):
        print( "%d / %d" % (i, len(urls) ) )
        bukkens.extend( get_bukken_list(url) )
        time.sleep(2)  # be polite: pause between requests
        
        # if i > 5:
        #     break
        
    save_to_csv(bukkens)


def save_to_csv(bukkens):
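    # One CSV row per rental unit. The header is kept in Japanese as in
    # the original post; the columns are: building name, address,
    # building age, building height, up to three location/access entries,
    # floor, rent, admin fee, deposit (shikikin), key money (reikin),
    # floor plan and floor area.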
    f = open('bukkens.csv', 'w', newline='')
    writer = csv.writer(f)
    
    header = ["建物名","住所","築年数","高さ","立地1","立地2","立地3",
              "階数","家賃","管理費","敷金","礼金","間取","面積"]
    writer.writerow(header)

    for bkn in bukkens:
        body = [
            bkn["subtitle"],
            bkn["subaddress"],
            bkn["age"],
            bkn["height"],
            bkn["locations0"],
            bkn["locations1"],
            bkn["locations2"],
            bkn["floor"],
            bkn["rent"],
            bkn["admin"],
            bkn["shiki_kin"],
            bkn["rei_kin"],
            bkn["floor_plan"],
            bkn["area"] ]
        writer.writerow( body )

    f.close()


def get_bukken_list(url):
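    # Fetch one search-result page and return a list of dicts, one per
    # rental unit, combining the building-level fields (name, address,
    # age, ...) with the values from the per-unit table rows.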

    result = requests.get(url)

    soup = BeautifulSoup(result.content, 'html.parser')
    summary = soup.find("div",{'id':'js-bukkenList'})
    
    cassetteitems = summary.find_all("div",{'class':'cassetteitem'})

    # Crude pattern for pulling the inner text out of a single element
    # rendered back to a string with str()
    re_substr = re.compile('<.+>(.+)</.+>')

    ret_bukkens = []

    for cassetteitem in cassetteitems:
        
        # Building name
        subtitle = cassetteitem.find_all(
            "div",{'class':'cassetteitem_content-title'} )
        subtitle = str( subtitle[0] )
        result = re_substr.search( subtitle )
        subtitle = result.group(1)
        
        # Address
        subaddress = cassetteitem.find_all(
            "li",{'class':'cassetteitem_detail-col1'})
        subaddress = str( subaddress[0] )
        result = re_substr.search( subaddress )
        subaddress = result.group(1)

        # Building age and building height
        col3 = cassetteitem.find_all(
            "li",{'class':'cassetteitem_detail-col3'})
        cols = col3[0].find_all('div')
        age = height = ""
        if len(cols) >= 1:
            age = cols[0].find(text=True)
        if len(cols) >= 2:
            height = cols[1].find(text=True)
            
        # Location / access (up to three entries)
        sublocations = cassetteitem.find_all(
            "li",{'class':'cassetteitem_detail-col2'})

        cols = sublocations[0].find_all('div')
        locations0 = locations1 = locations2 = ""
        if len(cols) >= 1:
            locations0 = cols[0].find(text=True)
        if len(cols) >= 2:
            locations1 = cols[1].find(text=True)
        if len(cols) >= 3:
            locations2 = cols[2].find(text=True)

        # Per-unit table: floor, rent, admin fee, deposit / key money,
        # floor plan and floor area
        re_text = re.compile('.+')

        # Iterate over the tables inside this cassetteitem only; using
        # the whole page ("summary") here would attach every unit on the
        # page to every building.
        for table in cassetteitem.find_all('table'):
            for tr in table.find_all('tr'):
                cols = tr.find_all('td')
                if len(cols) == 0:
                    continue
                # floor
                floor = cols[2].find(text=True).strip()
                # rent, admin fee
                (rent,admin) = cols[3].find_all( text=re_text )
                # deposit (shikikin), key money (reikin)
                (shiki_kin,rei_kin) = cols[4].find_all( text=re_text )[:2]
                # floor plan, floor area
                (floor_plan, area) = cols[5].find_all( text=re_text )[:2]

                ret_bukkens.append(
                    {"subtitle"  :subtitle,     # building name
                     "subaddress":subaddress,   # address
                     "age"       :age,          # building age
                     "height"    :height,       # building height
                     "locations0":locations0,   # location / access
                     "locations1":locations1,
                     "locations2":locations2,
                     "floor"     :floor,        # floor
                     "rent"      :rent,         # rent
                     "admin"     :admin,        # admin fee
                     "shiki_kin" :shiki_kin,    # deposit
                     "rei_kin"   :rei_kin,      # key money
                     "floor_plan":floor_plan,   # floor plan
                     "area"      :area          # floor area
                    })

    return ret_bukkens


def get_req_urls():
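    # Fetch the first result page, read the total page count out of the
    # pagination navigation, and return the URLs of all result pages.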
    url = req_url_1st

    result = requests.get(url)
    
    soup = BeautifulSoup(result.content, 'html.parser')
    
    # Get the number of result pages from the pagination navigation
    pages = soup.find("body").find_all(
        "div",{'class':'pagination pagination_set-nav'})
    # The chunk before '</a></li>\n</ol>' ends with the highest page
    # number; take the last few characters and strip the stray '>'
    pages_split = str(pages).split('</a></li>\n</ol>')
    last_page = int(pages_split[0][-3:].replace('>',''))

    ret_urls = [url]

    # Pages 2..N are the same search URL with '&pn=<page number>' appended
    for i in range(last_page-1):
        pg = str(i+2)
        ret_urls.append(url + '&pn=' + pg)
        
    return ret_urls


if __name__ == '__main__':
    main()
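
Running the script writes bukkens.csv to the current directory. As a quick sanity check of the output (a minimal sketch, assuming the scrape above has completed and the file exists):

import csv

with open('bukkens.csv', newline='') as f:
    rows = list(csv.reader(f))

print("columns :", rows[0])
print("listings:", len(rows) - 1)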