import re
import requests
import sqlite3
db_path = "saawo.sqlite"
cgi_url = "https://etsuran2.mlit.go.jp/TAKKEN/takkenKensaku.do"
per_page = 50
cgi_params = {"CMD":"selectPage",
"caller":"TK",
"choice":"1",
"rdoSelect":"1",
"rdoSelectJoken":"1",
"rdoSelectSort":"1",
"sortValue":"1",
"dispCount":per_page,
"licenseNoKbn":"%02d",
"pageCount":"%d",
"pageListNo1":"%d",
"pageListNo2":"%d"}
license_nos = {
-1:"信託会社",0:"国交大臣",
1:"北海道",2:"青森",3:"岩手",4:"宮城",5:"秋田",6:"山形",7:"福島",
8:"茨城",9:"栃木",10:"群馬",11:"埼玉",12:"千葉",13:"東京都",14:"神奈川",
15:"新潟",16:"富山",17:"石川",18:"福井",19:"山梨",20:"長野",21:"岐阜",
22:"静岡",23:"愛知",24:"三重",25:"滋賀",26:"京都",27:"大阪",28:"兵庫",
29:"奈良",30:"和歌山",31:"鳥取",32:"島根",33:"岡山",34:"広島",35:"山口",
36:"徳島",37:"香川",38:"愛媛",39:"高知",40:"福岡",41:"佐賀",42:"長崎",
43:"熊本",44:"大分",45:"宮崎",46:"鹿児島",47:"沖縄",}
ddl_sql = """\
CREATE TABLE IF NOT EXISTS real_estate_shop (
government varchar(16), -- 例; 国土交通大臣、東京都、信託会社
licence varchar(16),
shop varchar(64),
PRIMARY KEY(government,licence) ) """
def main():
db_cur = init_db()
shops = []
for license_no, license_name in license_nos.items():
pno = 1
total_size = pno * per_page
while pno * per_page <= total_size:
shops, total_size = \
get_real_estate_shops( license_no, license_name, pno )
upsert_real_estate_shops( db_cur, shops )
db_cur.connection.commit()
pno += 1
db_cur.connection.close()
def upsert_real_estate_shops( db_cur, shops ):
sql = '''INSERT INTO real_estate_shop (government, licence, shop )
VALUES (?, ?, ?)
ON CONFLICT (government,licence)
DO UPDATE SET shop=excluded.shop
'''
db_cur.executemany(sql, shops )
def get_real_estate_shops(license_no,license_name, pno=1):
cgi_params["licenseNoKbn"] = "%02d" % (license_no)
cgi_params["pageCount"] = pno
cgi_params["pageListNo1"] =pno
cgi_params["pageListNo2"] =pno
req = requests.Request('GET', cgi_url, params=cgi_params)
prepared = req.prepare()
print( prepared.url )
with requests.Session() as session:
response = session.send(prepared)
res_txts = response.content.decode('cp932')
res_txts = res_txts.replace("\r","").replace("\n","")
re_size = re.compile("検索結果:(\d+)件")
re_result = re_size.search( res_txts )
total_size = int(re_result.group(1))
re_str = \
'<td .*?title="licenseNo">.*?第(\d+)号</td>.*?'+ \
'onclick="js_ShowDetail\(\'.+?\'\)">(.+?)</a>'
re_pat_shops = re.compile( re_str )
re_result = re_pat_shops.findall( res_txts )
shops = []
for match in re_pat_shops.finditer( res_txts ):
shop_name = match.group(2).replace("株式会社","")
shop_name = shop_name.strip()
shops.append([license_name, match.group(1), shop_name ])
return shops, total_size
def init_db():
db_con = sqlite3.connect( db_path )
db_cur = db_con.cursor()
db_cur.execute( ddl_sql )
return db_cur
if __name__ == '__main__':
main()