http://d.hatena.ne.jp/end0tknr/20140302/1393733814
先日の↑エントリの別バージョン。
住所から郵便番号を検索する場合、市町村合併が影響するので、Googleのカスタム検索APIの結果から、郵便局の郵便番号検索ページを探すとよさそう。
という訳で↓こう書きました。
(住所変換→〒というより、住所に含まれる全ての〒を展開?します)
#!/usr/local/bin/perl
#
# Expand address rows of a tab-separated file into postal-code rows.
#
# For every input row (column 0 and column 1 are used to build the search
# string) the script:
#   1. asks a web search API for "郵便番号 <col0> <col1>",
#   2. picks the Japan Post zipcode-search URLs out of the search results,
#   3. fetches those pages and prints the input row once per zipcode found,
#      with (zipcode, city, town) appended as extra columns.
# Rows whose column 5 is already filled are echoed unchanged, so the output
# of a previous run can be fed back in to fill in the remaining rows.
#
# Usage: script.pl <tsv-file>   (UTF-8 in, UTF-8 out on STDOUT, log on STDERR)
use strict;
use warnings;
use utf8;
use Encode;
use JSON;
use LWP::UserAgent;
use HTTP::Request;
use Data::Dumper;

main(@ARGV);

# Entry point: read the TSV file named by the first argument and process it
# row by row. Dies when the file cannot be opened or an HTTP request fails.
sub main {
    my ($org_file) = @_;

    die "usage: $0 <tsv-file>\n" unless defined $org_file;

    # Three-arg open so a leading '>' or '|' in the file name is never
    # interpreted as a mode.
    open(my $fh, '<', $org_file)
        or die "can't open file $org_file: $!";
    my @org_lines = <$fh>;
    close($fh);

    my $ua = LWP::UserAgent->new();
    $ua->timeout(10);

    for my $shouene_def (@org_lines) {
        $shouene_def = decode('utf8', $shouene_def);
        chomp($shouene_def);

        # Columns: prefecture, municipality name, ...
        my @shouene_cols = split(/\t/, $shouene_def);

        # Already expanded (column 5 filled), or nothing to search for:
        # pass the row through untouched.
        my $already_done = defined $shouene_cols[5] && $shouene_cols[5] ne '';
        my $no_address   = !defined $shouene_cols[0] || $shouene_cols[0] eq '';
        if ($already_done || $no_address) {
            print encode('utf8', $shouene_def), "\n";
            next;
        }

        my $search_str = '郵便番号 ' . $shouene_cols[0];
        my $city = defined $shouene_cols[1] ? $shouene_cols[1] : '';

        # A municipality written as "...(旧XXX)" is searched by its former
        # name XXX only.
        # NOTE(review): the full-width parentheses were added on the
        # assumption that the original class lost them in transcription
        # (it listed the ASCII paren twice) -- confirm against real data.
        if ($city =~ /[((]旧(.+)[))]/) {
            $search_str .= ' ' . $1;
        }
        else {
            $search_str .= ' ' . $city;
        }

        # Web search API request.
        # NOTE(review): confirm this endpoint is still served before relying
        # on it.
        my $url = 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&q='
            . url_encode($search_str);
        my $req = HTTP::Request->new(GET => $url);
        # The search API needs a referer.
        $req->referer('http://d.hatena.ne.jp/end0tknr/');

        my $postal_cgi_urls =
            _search_japanpost_urls($ua, $req, encode('utf8', $shouene_def));

        if (scalar(@$postal_cgi_urls) == 0) {
            print STDERR "can't find japanpost cgi url ",
                encode('utf8', $shouene_def), "\n";
            print encode('utf8', join("\t", @shouene_cols)), "\n";
        }

        my $found_zipcodes = 0;
        for my $postal_cgi_url (@$postal_cgi_urls) {
            my $req_2 = HTTP::Request->new(GET => $postal_cgi_url);
            my $res_2 = $ua->request($req_2);
            unless ($res_2->is_success) {
                die $res_2->status_line;
            }

            # Pull the zipcodes out of the Japan Post result page.
            my $zipcodes = extract_zipcodes($res_2->decoded_content);
            for my $zipcode (@$zipcodes) {
                print encode('utf8', join("\t", @shouene_cols, @$zipcode)),
                    "\n";
                $found_zipcodes++;
            }
        }

        # Pages were fetched but nothing matched: keep the row in the output
        # (as in the "no url" case) instead of silently dropping it, so a
        # later run can retry it.
        if (scalar(@$postal_cgi_urls) > 0 && $found_zipcodes == 0) {
            print STDERR "can't find zipcode in japanpost page ",
                encode('utf8', $shouene_def), "\n";
            print encode('utf8', join("\t", @shouene_cols)), "\n";
        }

        # Be polite to the remote servers between rows.
        sleep(2);
    }
    return;
}

# Run the search request up to 20 times until at least one Japan Post URL
# shows up in the results. $row_label is the already-encoded row text used
# in the retry log line. Returns an array ref of URLs (empty when every
# attempt came back without a usable URL); dies on an HTTP failure.
sub _search_japanpost_urls {
    my ($ua, $req, $row_label) = @_;

    my $max_tries = 20;
    for my $retry_count (1 .. $max_tries) {
        my $res = $ua->request($req);
        unless ($res->is_success) {
            die $res->status_line;
        }

        # Extract the Japan Post search-result page URLs.
        my $res_pl = from_json($res->decoded_content);
        my $postal_cgi_urls = extract_japanpost_cgi_url($res_pl);
        return $postal_cgi_urls if scalar(@$postal_cgi_urls) > 0;

        sleep(3);
        print STDERR "retrying search - $retry_count - ", $row_label, "\n";
    }
    return [];
}

# Extract every (zipcode, city, town) triple from a Japan Post result page.
# $html_src is the decoded page text. Returns an array ref of
# [zipcode, city, town] array refs, in page order; empty when nothing
# matches or the input is undefined.
sub extract_zipcodes {
    my ($html_src) = @_;
    return [] unless defined $html_src;

    # Collapse whitespace runs so the cells are separated by exactly one
    # space, which is what the pattern below expects.
    $html_src =~ s/\s+/ /g;

    my $row_pattern = join(' ',
        '<td class="data">(\d+-\d+)</td>',                 # zipcode
        '<td class="data">([^\x00-\x7f]*?)</td>',          # city
        '<td>[\x00-\x7f]+([^\x00-\x7f]+)[\x00-\x7f]+',     # town
    );
    my $row_re = qr{$row_pattern};

    my @ret;
    while ($html_src =~ /$row_re/g) {
        push(@ret, [$1, $2, $3]);
    }
    return \@ret;
}

# Percent-encode a character string for use in a URL query: the string is
# encoded to UTF-8 and every byte other than [A-Za-z0-9_] becomes %xx
# (lowercase hex). Returns '' for undefined input.
sub url_encode {
    my ($str) = @_;
    return '' unless defined $str;

    $str = encode('utf8', $str);
    # Explicit byte class instead of \W, so the result does not depend on
    # which character-class semantics are in effect for 0x80-0xFF.
    $str =~ s/([^A-Za-z0-9_])/'%' . unpack('H2', $1)/eg;
    return $str;
}

# Pick the Japan Post zipcode-search URLs out of a decoded search response.
# Reads $res_pl->{responseData}{results}[*]{unescapedUrl}. URLs ending in
# "pref=N&city=N" get '&cmp=1&mode=list&addr=' appended; URLs that already
# contain "&mode=list" are taken as they are. A leading "https" is rewritten
# to "http". Returns a sorted, de-duplicated array ref of URLs.
sub extract_japanpost_cgi_url {
    my ($res_pl) = @_;

    # Walk the structure defensively: a response without results must not
    # die, nor autovivify keys in the caller's data.
    return [] unless ref($res_pl) eq 'HASH';
    my $response_data = $res_pl->{responseData};
    return [] unless ref($response_data) eq 'HASH';
    my $results = $response_data->{results};
    return [] unless ref($results) eq 'ARRAY';

    my %ret_urls;
    for my $result (@$results) {
        next unless ref($result) eq 'HASH';
        my $unescaped_url = $result->{unescapedUrl};
        next unless defined $unescaped_url;

        if ($unescaped_url =~
            m{www\.post\.japanpost\.jp/cgi-zip/zipcode\.php\?pref=\d+&city=\d+$})
        {
            my $ret_url = $unescaped_url . '&cmp=1&mode=list&addr=';
            $ret_url =~ s/^https/http/;
            $ret_urls{$ret_url} = 1;
        }
        if ($unescaped_url =~
            m{www\.post\.japanpost\.jp/cgi-zip/zipcode\.php\?pref=\d+&city=\d+.*&mode=list})
        {
            my $ret_url = $unescaped_url;
            $ret_url =~ s/^https/http/;
            $ret_urls{$ret_url} = 1;
        }
    }
    return [sort keys %ret_urls];
}

# Look up the address cells that follow a given zipcode in a result page
# that wraps each cell in <small>. Returns [prefecture, city, town] for the
# first matching row, or [] when there is no match or an argument is
# undefined. $zipcode is matched literally.
sub extract_addr {
    my ($zipcode, $html_src) = @_;
    return [] unless defined $zipcode && defined $html_src;

    $html_src =~ s/\s+/ /g;

    my $row_pattern = join(' ',
        # quotemeta: the zipcode is data, not a pattern.
        '<td class="data"><small>(' . quotemeta($zipcode) . ')</small></td>',
        '<td class="data"><small>([^\x00-\x7f]*?)</small></td>',  # prefecture
        '<td class="data"><small>([^\x00-\x7f]*?)</small></td>',  # city
        '<td>[\x00-\x7f]+([^\x00-\x7f]+)[\x00-\x7f]+',            # town
    );

    if ($html_src =~ m{$row_pattern}) {
        return [$2, $3, $4];
    }
    return [];
}