end0tknr's kipple - web写経開発

太宰府天満宮の狛犬って、妙にカワイイ

国交省の宅建業者等企業情報検索システムにある不動産会社をpythonで http getし、sqliteへ保存

import re
import requests
import sqlite3

db_path  = "saawo.sqlite"
cgi_url  = "https://etsuran2.mlit.go.jp/TAKKEN/takkenKensaku.do"
per_page = 50
cgi_params = {"CMD":"selectPage",
              "caller":"TK",
              "choice":"1",
              "rdoSelect":"1",
              "rdoSelectJoken":"1",
              "rdoSelectSort":"1",
              "sortValue":"1",
              "dispCount":per_page,
              "licenseNoKbn":"%02d",
              "pageCount":"%d",
              "pageListNo1":"%d",
              "pageListNo2":"%d"}

license_nos = {
    -1:"信託会社",0:"国交大臣",
    1:"北海道",2:"青森",3:"岩手",4:"宮城",5:"秋田",6:"山形",7:"福島",
    8:"茨城",9:"栃木",10:"群馬",11:"埼玉",12:"千葉",13:"東京都",14:"神奈川",
    15:"新潟",16:"富山",17:"石川",18:"福井",19:"山梨",20:"長野",21:"岐阜",
    22:"静岡",23:"愛知",24:"三重",25:"滋賀",26:"京都",27:"大阪",28:"兵庫",
    29:"奈良",30:"和歌山",31:"鳥取",32:"島根",33:"岡山",34:"広島",35:"山口",
    36:"徳島",37:"香川",38:"愛媛",39:"高知",40:"福岡",41:"佐賀",42:"長崎",
    43:"熊本",44:"大分",45:"宮崎",46:"鹿児島",47:"沖縄",}


ddl_sql = """\
CREATE TABLE IF NOT EXISTS real_estate_shop (
government              varchar(16),  -- 例; 国土交通大臣、東京都、信託会社
licence                 varchar(16),
shop                    varchar(64),
PRIMARY KEY(government,licence) )  """


def main():
    db_cur = init_db()

    shops = []
    for license_no,  license_name in license_nos.items():
        pno = 1
        total_size = pno * per_page # p.1の場合、dummy
        while pno * per_page <= total_size:
            shops, total_size = \
                get_real_estate_shops( license_no, license_name, pno )
            upsert_real_estate_shops( db_cur, shops )
            db_cur.connection.commit()
            pno += 1
        
    db_cur.connection.close()


def upsert_real_estate_shops( db_cur, shops ):
    sql = '''INSERT INTO real_estate_shop (government, licence, shop )
    VALUES (?, ?, ?)
    ON CONFLICT (government,licence)
    DO UPDATE SET shop=excluded.shop
 '''
    db_cur.executemany(sql, shops )

    
def get_real_estate_shops(license_no,license_name, pno=1):
    
    cgi_params["licenseNoKbn"] = "%02d" % (license_no)
    cgi_params["pageCount"]  = pno
    cgi_params["pageListNo1"] =pno
    cgi_params["pageListNo2"] =pno

    req = requests.Request('GET', cgi_url, params=cgi_params)
    prepared = req.prepare()
    print( prepared.url )

    with requests.Session() as session:
        response = session.send(prepared)
        
    res_txts = response.content.decode('cp932')
    res_txts = res_txts.replace("\r","").replace("\n","")

    re_size = re.compile("検索結果:(\d+)件")
    re_result = re_size.search( res_txts )
    total_size = int(re_result.group(1))

    re_str = \
        '<td .*?title="licenseNo">.*?第(\d+)号</td>.*?'+ \
        'onclick="js_ShowDetail\(\'.+?\'\)">(.+?)</a>'

    re_pat_shops = re.compile( re_str )
    re_result = re_pat_shops.findall( res_txts )
    shops = []
    for match in re_pat_shops.finditer( res_txts ):
        shop_name = match.group(2).replace("株式会社","")
        shop_name = shop_name.strip()
        shops.append([license_name, match.group(1), shop_name ])
    return shops, total_size

def init_db():
    db_con = sqlite3.connect( db_path )
    db_cur = db_con.cursor()

    db_cur.execute( ddl_sql )
    return db_cur


if __name__ == '__main__':
    main()