機械学習を使って東京23区のお買い得賃貸物件を探してみた 〜スクレイピング編〜 - データで見る世界
↑こちらを写経し、↓こう書きました
#!python3
# -*- coding: utf-8 -*-
"""Scrape rental listings from suumo.jp search results into ``bukkens.csv``.

The script fetches the first search-result page to learn how many result
pages exist, downloads every page, extracts one record per room (combined
with the data of the building the room belongs to) and writes all records
to a CSV file in the current directory.
"""
import csv
import re
import sys
import time

from bs4 import BeautifulSoup
import requests

# First page of the search results.
req_url_1st = \
    "http://suumo.jp/jj/chintai/ichiran/FR301FC001/?" + \
    "&".join(["ar=030", "bs=040", "ta=13", "sc=13121", "cb=0.0",
              "ct=9999999", "et=9999999", "cn=9999999", "mb=0",
              "mt=9999999", "shkr1=03", "shkr2=03", "shkr3=03",
              "shkr4=03", "fw2=", "srch_navi=1"])

# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT_SEC = 30

# Pause between two page downloads, to avoid hammering the server.
REQUEST_INTERVAL_SEC = 2

# CSV layout: (column title written to the header row, key in a listing dict).
_CSV_COLUMNS = [
    ("建物名", "subtitle"),      # building name
    ("住所", "subaddress"),      # address
    ("築年数", "age"),           # building age
    ("高さ", "height"),          # building height
    ("立地1", "locations0"),     # location (up to three entries)
    ("立地2", "locations1"),
    ("立地3", "locations2"),
    ("階数", "floor"),           # floor
    ("家賃", "rent"),            # rent
    ("管理費", "admin"),         # administration fee
    ("敷金", "shiki_kin"),       # deposit
    ("礼金", "rei_kin"),         # key money
    ("間取", "floor_plan"),      # floor plan
    ("面積", "area"),            # area
]

# Captures the text between an opening and a closing tag in serialized HTML.
_RE_INNER_TEXT = re.compile('<.+>(.+)</.+>')

# Matches any text node containing at least one non-newline character.
_RE_ANY_TEXT = re.compile('.+')


def main():
    """Download every result page and save all listings to ``bukkens.csv``."""
    urls = get_req_urls()
    bukkens = []
    for i, url in enumerate(urls, start=1):
        print("%d / %d" % (i, len(urls)))
        bukkens.extend(get_bukken_list(url))
        time.sleep(REQUEST_INTERVAL_SEC)
    save_to_csv(bukkens)


def save_to_csv(bukkens):
    """Write the listings to ``bukkens.csv`` in the current directory.

    Args:
        bukkens: iterable of dicts as produced by ``get_bukken_list``; each
            dict must contain every key named in ``_CSV_COLUMNS``.

    The file starts with one header row followed by one row per listing.
    No explicit encoding is passed to ``open``, so the platform default
    encoding is used.
    """
    header = [title for title, _ in _CSV_COLUMNS]
    # The context manager closes the file even if a row fails to serialize.
    with open('bukkens.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        for bkn in bukkens:
            writer.writerow([bkn[key] for _, key in _CSV_COLUMNS])


def _fetch_soup(url):
    """Download ``url`` and return its body parsed with ``html.parser``.

    Raises:
        requests.RequestException: on timeout, connection failure or an
            HTTP error status.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SEC)
    # Fail loudly on 4xx/5xx instead of parsing an error page as listings.
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')


def _inner_text(tag):
    """Return the text enclosed by ``tag``, or "" when ``tag`` is None."""
    if tag is None:
        return ""
    match = _RE_INNER_TEXT.search(str(tag))
    if match:
        return match.group(1)
    # The pattern cannot match across newlines; fall back to the plain text.
    return tag.get_text(strip=True)


def _first_texts(tags, count):
    """Return the first text node of up to ``count`` tags, padded with ""."""
    texts = [tag.find(text=True) for tag in tags[:count]]
    return texts + [""] * (count - len(texts))


def _cell_texts(cell, count):
    """Return the first ``count`` non-empty text nodes of ``cell``, padded with ""."""
    texts = list(cell.find_all(text=_RE_ANY_TEXT)[:count])
    return texts + [""] * (count - len(texts))


def _child_divs(parent, tag_name, class_name):
    """Return all <div> descendants of the first matching element, or []."""
    holder = parent.find(tag_name, {'class': class_name})
    if holder is None:
        return []
    return holder.find_all('div')


def _parse_building(cassetteitem):
    """Extract the per-building fields from one ``cassetteitem`` element."""
    # Building age and building height.
    age, height = _first_texts(
        _child_divs(cassetteitem, "li", 'cassetteitem_detail-col3'), 2)
    # Location (at most three entries are kept).
    locations = _first_texts(
        _child_divs(cassetteitem, "li", 'cassetteitem_detail-col2'), 3)
    return {
        "subtitle": _inner_text(cassetteitem.find(
            "div", {'class': 'cassetteitem_content-title'})),  # building name
        "subaddress": _inner_text(cassetteitem.find(
            "li", {'class': 'cassetteitem_detail-col1'})),     # address
        "age": age,
        "height": height,
        "locations0": locations[0],
        "locations1": locations[1],
        "locations2": locations[2],
    }


def _parse_room(cols):
    """Extract the per-room fields from the <td> cells of one table row."""
    rent, admin = _cell_texts(cols[3], 2)            # rent, administration fee
    shiki_kin, rei_kin = _cell_texts(cols[4], 2)     # deposit, key money
    floor_plan, area = _cell_texts(cols[5], 2)       # floor plan, area
    return {
        "floor": (cols[2].find(text=True) or "").strip(),  # floor
        "rent": rent,
        "admin": admin,
        "shiki_kin": shiki_kin,
        "rei_kin": rei_kin,
        "floor_plan": floor_plan,
        "area": area,
    }


def get_bukken_list(url):
    """Return one dict per room listed on the result page at ``url``.

    Each dict carries the keys named in ``_CSV_COLUMNS``: the fields of the
    building (name, address, age, height, up to three locations) merged with
    the fields of one room of that building (floor, rent, administration fee,
    deposit, key money, floor plan, area).

    Returns an empty list when the page has no ``js-bukkenList`` element.
    """
    soup = _fetch_soup(url)
    summary = soup.find("div", {'id': 'js-bukkenList'})
    if summary is None:
        return []

    ret_bukkens = []
    for cassetteitem in summary.find_all("div", {'class': 'cassetteitem'}):
        building = _parse_building(cassetteitem)
        # Only look at the tables of THIS building; scanning the whole
        # listing would pair every room on the page with every building.
        for table in cassetteitem.find_all('table'):
            for tr in table.find_all('tr'):
                cols = tr.find_all('td')
                # Header rows have no <td>; rows with fewer than six cells
                # cannot hold the columns read by _parse_room.
                if len(cols) < 6:
                    continue
                bukken = dict(building)
                bukken.update(_parse_room(cols))
                ret_bukkens.append(bukken)
    return ret_bukkens


def _parse_last_page(soup):
    """Return the highest page number shown in the pagination, at least 1."""
    page_numbers = [1]
    for nav in soup.find_all(
            "div", {'class': 'pagination pagination_set-nav'}):
        for item in nav.find_all("li"):
            label = item.get_text(strip=True)
            # Skip non-numeric entries such as ellipsis placeholders.
            if label.isdecimal():
                page_numbers.append(int(label))
    return max(page_numbers)


def get_req_urls():
    """Return the URLs of all search-result pages, first page first.

    The first page is downloaded to read the page count from its pagination;
    page ``n`` (n >= 2) is addressed by appending ``&pn=n`` to the first URL.
    """
    url = req_url_1st
    last_page = _parse_last_page(_fetch_soup(url))
    ret_urls = [url]
    for pn in range(2, last_page + 1):
        ret_urls.append(url + '&pn=' + str(pn))
    return ret_urls


if __name__ == '__main__':
    main()