
Fetching the data list of the government statistics portal e-stat with BeautifulSoup for Python

www.e-stat.go.jp

The data list of the government statistics portal e-stat can perhaps be downloaded from the URL above, but I could not work out how, so I scraped it with BeautifulSoup for Python instead.
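The script below walks the stat-search result pages in order, pulls each survey's id (the data-value attribute), its name, and two links out of the result list, and writes them to stdout as tab-separated lines; progress and retry warnings go to stderr.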

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup
import requests
import sys
import time

base_url    = "https://www.e-stat.go.jp"
search_path = "/stat-search"
max_page_no = 13   # number of result pages at the time of writing

conf = {
    "retry_limit": 10,   # max attempts per page
    "retry_sleep": 10,   # seconds to wait between retries
}


def main():

    data_list = []
    search_base_url = base_url + search_path

    # result pages are 1-origin; walk pages 1..max_page_no
    for page_no in range(1, max_page_no + 1):
        print("page:", page_no, file=sys.stderr, flush=True)

        data_list_tmp = get_data_list(search_base_url, page_no)
        data_list.extend(data_list_tmp)

    disp_data_list(data_list)

def disp_data_list(data_items):
    # print one tab-separated line per survey
    for data_item in data_items:
        disp_str = "\t".join([data_item["id"],
                              data_item["name"],
                              data_item["url_1"],
                              data_item["url_2"]])
        print(disp_str)

def get_data_list(search_base_url, page_no):

    http_result = get_http_requests(search_base_url, page_no)
    if not http_result:
        return []

    soup = BeautifulSoup(http_result.content, 'html.parser')

    # each search result is an <li> carrying the survey id in data-value
    data_lis = soup.select("ul.stat-search_result-list li")
    data_list = []

    for data_li in data_lis:
        data_id = data_li["data-value"]

        # survey name
        spans = data_li.select(
            "div.stat-toukei_name_items span.stat-title")
        data_name = spans[0].text.strip()

        # link to the survey's result page
        a_hrefs_1  = data_li.select("div.stat-search_result-item1-main a")
        data_url_1 = base_url + a_hrefs_1[0]["href"]

        # link to the survey's detail page
        a_hrefs_2  = data_li.select("div.stat-pc_detail a")
        data_url_2 = base_url + a_hrefs_2[0]["href"]

        data_list.append(
            {"id":    data_id,
             "name":  data_name,
             "url_1": data_url_1,
             "url_2": data_url_2})

    return data_list

def get_http_requests(search_base_url, page_no):
    result_url = search_base_url + "?page=" + str(page_no)

    for _ in range(conf["retry_limit"]):
        try:
            result = requests.get(result_url, timeout=30)
            result.raise_for_status()
            return result
        except requests.exceptions.RequestException:
            print("WARN retry requests.get()", result_url, file=sys.stderr)
            time.sleep(conf["retry_sleep"])

    print("ERROR requests.get()", result_url, file=sys.stderr)
    return None


if __name__ == '__main__':
    main()
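
For reference, a minimal sketch of reading the saved output back with the standard csv module; the file names get_estat_data_list.py and estat_data_list.tsv are only placeholders for illustration:

$ python3 get_estat_data_list.py > estat_data_list.tsv

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# read the tab-separated list saved above (file name is hypothetical)
import csv

with open("estat_data_list.tsv", newline="", encoding="utf-8") as f:
    for row in csv.reader(f, delimiter="\t"):
        data_id, data_name, data_url_1, data_url_2 = row
        print(data_id, data_name)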