end0tknr's kipple - 新web写経開発

http://d.hatena.ne.jp/end0tknr/ から移転します

PerlのLWP::UserAgent と Google Custom Search APIで 郵便番号検索ページにHTTP GETして住所変換→〒

http://d.hatena.ne.jp/end0tknr/20140302/1393733814
先日の↑エントリの別バージョン。

住所から郵便番号を検索する場合、市町村合併が影響するので、googleのカスタム検索APIの結果から、郵便局の郵便番号検索ページを探すとよさそう。

という訳で↓こう書きました。
(住所変換→〒というより、住所に含まれる全ての〒を展開?します)

#!/usr/local/bin/perl
use strict;
use utf8;
use Encode;
use JSON;
use LWP::UserAgent;
use Data::Dumper;

# Entry point: pass the command-line arguments to main(), which reads the
# first one as the input file path.
main( @ARGV );

# Reads a tab-separated address list and prints it to STDOUT with postal
# codes appended.
#
# Arguments:
#   $org_file - path of a UTF-8 encoded, tab-separated input file. Columns 0
#               and 1 are used to build the search query (the original author
#               labels them prefecture and municipality).
#
# For every input line:
#   * if column 5 is already true, the line is echoed unchanged;
#   * otherwise a web search is issued, Japan Post result-page URLs are
#     extracted from the answer, each page is fetched, and one output line is
#     printed per (postal code, municipality, town) triple found, consisting
#     of the original columns followed by the triple;
#   * if no Japan Post URL is found after 20 attempts, a diagnostic goes to
#     STDERR and the original columns are printed unchanged.
#
# Dies when the input file cannot be opened or when any HTTP request fails.
sub main {
    my ($org_file) = @_;

    defined $org_file
        or die "usage: $0 <tab-separated address file>\n";

    # Three-argument open: the file name can never be interpreted as an open
    # mode or a pipe, and the OS error is reported on failure.
    open(my $fh, '<', $org_file)
        or die "can't open file '$org_file': $!";
    my @org_lines = <$fh>;
    close($fh);

    my $ua = LWP::UserAgent->new();

    $ua->timeout(10);

    for my $shouene_def (@org_lines) {
        $shouene_def = decode('utf8',$shouene_def);

        chomp($shouene_def);
        # columns: prefecture, municipality, ...
        my @shouene_cols = split(/\t/,$shouene_def);

        # Column 5 already filled in: pass the line through untouched.
        if($shouene_cols[5]){
            print encode('utf8',$shouene_def),"\n";
            next;
        }

        my $search_str = '郵便番号 '.$shouene_cols[0];
        # When column 1 carries a parenthesised "former name" annotation,
        # search with that name instead of the whole column.
        # NOTE(review): both alternatives in each bracket class below are the
        # ASCII parenthesis; presumably one was meant to be the full-width
        # form - confirm against real input data.
        if( $shouene_cols[1] =~ /[(\(]旧(.+)[)\)]/){
            $search_str .= ' '. $1;
        } else {
            $search_str .= ' '. $shouene_cols[1];
        }
        # google custom search api
        my $url = 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&q='
            .url_encode($search_str);

        my $req = HTTP::Request->new(GET=>$url);
        # google custom search api needs referer
        $req->referer('http://d.hatena.ne.jp/end0tknr/');

        my $retry_count = 0;
        my $postal_cgi_urls = [];
        while ($retry_count++ < 20 ){

            my $res = $ua->request($req);
            unless($res->is_success){
                die $res->status_line;
            }

            # Extract the Japan Post search-result page URLs.
            my $res_pl = from_json($res->decoded_content);
            $postal_cgi_urls = extract_japanpost_cgi_url($res_pl);
            if( scalar(@$postal_cgi_urls) >0 ){
                last;
            }
            sleep(3);
            print STDERR "retrying search - $retry_count - ". encode('utf8',$shouene_def),"\n";
        }


        if( scalar(@$postal_cgi_urls) ==0 ){
            print STDERR
                "can't  find japanpost cgi url ".
                    encode('utf8',$shouene_def),"\n";
            print encode('utf8', join("\t",@shouene_cols)),"\n";
        }

        for my $postal_cgi_url ( @$postal_cgi_urls ){
            my $req_2 = HTTP::Request->new(GET=>$postal_cgi_url);
            my $res_2 = $ua->request($req_2);
            unless($res_2->is_success){
                die $res_2->status_line;
            }
            # Extract the postal codes from the Japan Post result page.
            my $zipcodes = extract_zipcodes($res_2->decoded_content);
            for my $zipcode ( @$zipcodes ){
                print encode('utf8', join("\t",@shouene_cols,@$zipcode)),"\n";
            }
        }

        # Pause between input lines before issuing the next search.
        sleep(2);
    }
}

# Pulls every (postal code, municipality, town) triple out of a postal-code
# list page.
#
# Arguments:
#   $html_src - page HTML as a character string.
#
# Returns:
#   An array reference of [zipcode, municipality, town] array references in
#   page order; an empty array reference when nothing matches.
sub extract_zipcodes {
    my ($html_src) = @_;

    return [] unless defined $html_src;

    # Collapse every whitespace run to one space so the single spaces in the
    # pattern match however the table markup is wrapped.
    $html_src =~ s/\s+/ /g;

    # The town cell must be followed by at least one ASCII character, but that
    # text is only looked at, not consumed: consuming it greedily would run
    # into the next row's postal-code cell and make /g skip that row.
    my $row_re = qr{
        <td[ ]class="data"> (\d+-\d+) </td>                     # postal code
        [ ] <td[ ]class="data"> ([^\x00-\x7f]*?) </td>          # municipality
        [ ] <td> [\x00-\x7f]+ ([^\x00-\x7f]+) (?=[\x00-\x7f])   # town
    }x;

    my @ret;
    while ($html_src =~ /$row_re/g) {
        push(@ret, [$1,$2,$3]);
    }
    return \@ret;
}


# Percent-encodes a character string for use in a URL query.
#
# The string is first converted to UTF-8 octets; every octet that is not a
# word character is then replaced by "%" followed by its two-digit lowercase
# hexadecimal value.
sub url_encode {
    my ($str) = @_;

    my $octets = encode('utf8', $str);
    $octets =~ s{(\W)}{ sprintf('%%%02x', ord($1)) }eg;

    return $octets;
}


# Collects Japan Post postal-code list-page URLs from a decoded search
# response.
#
# Arguments:
#   $res_pl - decoded JSON response; result entries are read from
#             $res_pl->{responseData}{results}, each a hash with an
#             "unescapedUrl" key.
#
# Returns:
#   An array reference of distinct URLs, sorted as strings. A URL ending right
#   after the "pref=..&city=.." query gets '&cmp=1&mode=list&addr=' appended;
#   a URL that already contains "&mode=list" is taken as-is. A leading
#   "https" is rewritten to "http" in both cases. An empty array reference is
#   returned when the response has no usable results.
sub extract_japanpost_cgi_url {
    my ($res_pl) = @_;

    # Walk down the structure without autovivifying into the caller's data;
    # a missing or ill-typed node simply means "no results".
    my $response_data = ref($res_pl) eq 'HASH' ? $res_pl->{responseData} : undef;
    my $results = ref($response_data) eq 'HASH' ? $response_data->{results} : undef;
    return [] unless ref($results) eq 'ARRAY';

    # Anchored to scheme and host so that a foreign URL which merely contains
    # the Japan Post path (e.g. in its query string) is not accepted and later
    # fetched; the dot in "zipcode.php" is matched literally.
    my $base_re =
        qr{\Ahttps?://www\.post\.japanpost\.jp/cgi-zip/zipcode\.php\?pref=\d+&city=\d+};

    my %seen;
    for my $result (@$results) {
        next unless ref($result) eq 'HASH';

        my $ret_url = $result->{unescapedUrl};
        next unless defined $ret_url;

        if ($ret_url =~ /$base_re\z/) {
            # Bare prefecture/city URL: turn it into a list-mode request.
            $ret_url .= '&cmp=1&mode=list&addr=';
        }
        elsif ($ret_url !~ /$base_re.*&mode=list/) {
            next;
        }

        $ret_url =~ s/\Ahttps/http/;
        $seen{$ret_url} = 1;
    }

    return [sort keys %seen];
}

# Looks up one postal code in a result page and returns the address cells
# that follow it.
#
# Arguments:
#   $zipcode  - text to find in the first cell; it is interpolated into the
#               pattern as-is, so regex metacharacters keep their meaning.
#   $html_src - page HTML as a character string.
#
# Returns:
#   [prefecture, municipality, town] for the first matching row, or an empty
#   array reference when no row matches.
sub extract_addr {
    my ($zipcode,$html_src) = @_;

    # Collapse whitespace runs so the single spaces joining the cell patterns
    # match however the markup is wrapped.
    $html_src =~ s/\s+/ /g;

    # Wraps an inner pattern in the <td class="data"><small>..</small></td> markup.
    my $small_cell = sub { "<td class=\"data\"><small>$_[0]<\/small><\/td>" };
    my $non_ascii  = "([^\x00-\x7f]*?)";

    my $row_pattern = join ' ',
        $small_cell->("($zipcode)"),
        $small_cell->($non_ascii),                          # prefecture
        $small_cell->($non_ascii),                          # municipality
        "<td>[\x00-\x7f]+([^\x00-\x7f]+)[\x00-\x7f]+";      # town

    return [] unless $html_src =~ /$row_pattern/;
    return [$2,$3,$4];
}