Practicing scraping with BeautifulSoup for Python 3 - end0tknr's kipple - web写経開発
Having written the above, the same thing can also be achieved with selenium + chromedriver.exe, so this post enumerates the URLs of the SUUMO search-result pages that list new/used detached houses and condominiums.
#!python3
# -*- coding: utf-8 -*-
# http://chromedriver.chromium.org/getting-started
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
import os
import re
import sys
import time

chrome_conf = {
    "chrome_driver": os.getcwd() + '\\chromedriver.exe',
    "chrome_options": [#"--headless",
                       "--enable-logging=False",
                       # the following 3 options are said to be a workaround for SSL errors
                       "--ignore-certificate-errors",
                       "--disable-extensions",
                       "--disable-print-preview"],
    "implicitly_wait": 10
}

pref_names = [
    "hokkaido",
    "aomori","iwate","miyagi","akita","yamagata","fukushima",
    "ibaraki","tochigi",
    "gumma",   # suumo uses "gumma", not "gunma"
    "saitama","chiba","tokyo","kanagawa",
    "niigata","toyama","ishikawa","fukui","yamanashi","nagano","gifu",
    "shizuoka","aichi","mie","shiga","kyoto","osaka","hyogo","nara",
    "wakayama","tottori","shimane","okayama","hiroshima","yamaguchi",
    "tokushima","kagawa","ehime","kochi","fukuoka","saga","nagasaki",
    "kumamoto","oita","miyazaki","kagoshima" ]

base_urls = [
    "https://suumo.jp/ikkodate/",      # new detached houses
    "https://suumo.jp/chukoikkodate/", # used detached houses
    "https://suumo.jp/ms/shinchiku/",  # new condominiums
    "https://suumo.jp/ms/chuko/",      # used condominiums
]

def main():
    for base_url in base_urls:
        browser = init_browser()

        for pref_name in pref_names:
            # hokkaido urls have a trailing "_", as in "hokkaido_"
            if pref_name == "hokkaido" and \
               base_url in ("https://suumo.jp/ikkodate/",
                            "https://suumo.jp/chukoikkodate/",
                            "https://suumo.jp/ms/chuko/"):
                pref_name += "_"
            # the url layout differs from the other prefectures, so skip
            elif pref_name == "hokkaido" and \
                 base_url == "https://suumo.jp/ms/shinchiku/":
                continue

            search_result_urls = \
                get_search_result_urls(browser, base_url, pref_name)
            for search_result_url in search_result_urls:
                print("\t".join([base_url, pref_name, search_result_url]))

        browser.close()

def get_search_result_urls(browser, base_url, pref_name):
    func_name = sys._getframe().f_code.co_name
    print(func_name, base_url, pref_name, file=sys.stderr)

    req_url = base_url + pref_name + "/city/"
    browser.get(req_url)

    # click the search button
    submit_btns = \
        browser.find_elements_by_css_selector(".ui-btn--search")
    if len(submit_btns) == 0:
        print("ERROR find_elements_by_css_selector()", req_url,
              file=sys.stderr)
        sys.exit()
    submit_btns[0].click()
    time.sleep(2)

    # pagination links tell us how many result pages exist
    paginations = []
    paginations.extend(
        browser.find_elements_by_css_selector(
            ".pagination.pagination_set-nav ol li"))
    paginations.extend(
        browser.find_elements_by_css_selector(
            ".sortbox_pagination ol li"))

    ret_urls = [browser.current_url]
    if len(paginations) == 0:
        return ret_urls

    # append "&pn=N" for pages 2..last to the current search-result url
    for pno in range(1, int(paginations[-1].text)):
        ret_urls.append("%s&pn=%d" % (browser.current_url, pno + 1))
    return ret_urls

def init_browser():
    chopt = webdriver.ChromeOptions()
    for option_tmp in chrome_conf["chrome_options"]:
        chopt.add_argument(option_tmp)

    browser = webdriver.Chrome(options=chopt,
                               executable_path=chrome_conf["chrome_driver"])
    # wait up to implicitly_wait seconds for elements to be found
    browser.implicitly_wait(chrome_conf["implicitly_wait"])
    return browser

if __name__ == '__main__':
    main()
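The script prints one tab-separated line per result page (base_url, pref_name, search_result_url), so redirecting stdout to a file gives the URL list that can then be fed to the BeautifulSoup scraper from the earlier post.

Note that the script above uses the Selenium 3 API. Under Selenium 4, find_elements_by_css_selector() and the executable_path= argument were removed, so a minimal sketch of the equivalent setup and element lookup would look like the following (an illustration only, assuming selenium>=4 and chromedriver.exe placed in the current directory as above; selectors and URL are the same as in the script):

# minimal Selenium 4 sketch (assumption: selenium>=4, chromedriver.exe in cwd)
import os
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

def init_browser():
    chopt = webdriver.ChromeOptions()
    chopt.add_argument("--ignore-certificate-errors")
    # executable_path= is gone in Selenium 4; pass a Service object instead
    service = Service(os.getcwd() + "\\chromedriver.exe")
    browser = webdriver.Chrome(service=service, options=chopt)
    browser.implicitly_wait(10)
    return browser

browser = init_browser()
browser.get("https://suumo.jp/ikkodate/tokyo/city/")
# find_elements_by_css_selector() is gone; use find_elements(By.CSS_SELECTOR, ...)
submit_btns = browser.find_elements(By.CSS_SELECTOR, ".ui-btn--search")
if submit_btns:
    submit_btns[0].click()
browser.quit()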