python で apache access_log (gzip形式)をparse

apache access_log にあるuser agentからブラウザを判定 - end0tknr's kipple - 新web写経開発

apacheのログ(access_log)解析は、Apache::ParseLog 等のcpan moduleより正規表現 - end0tknr's kipple - 新web写経開発

perlで書いた上記エントリを、pythonで書いてみた。

正規表現と *.gz ファイルの読取りの練習です

#!/usr/local/python3/bin/python3
# -*- coding: utf-8 -*-
import gzip
import re
import sys
import datetime

# Regular expression for one Apache access_log line that carries referer and
# user-agent fields.
# Capture groups: 1=host 2=ident 3=user 4=time 5=method 6=resource 7=proto
#                 8=status 9=bytes 10=referer 11=agent
# Raw strings keep "\[" and "\]" from being treated as (invalid) Python
# string escapes; the resulting pattern text is unchanged.
re_pat_log_line = \
    " ".join([r'^([^ ]*) ([^ ]*) ([^ ]*) \[([^]]*)\] "([^ ]*)(?: *([^ ]*)',
              r'*([^ ]*))?" ([^ ]*) ([^ ]*) "(.*?)" "(.*?)"'])
re_log_line = re.compile(re_pat_log_line)

# Regular expression for the access_log timestamp,
# e.g. 12/Jun/2020:04:27:27 +0900
# Capture groups: 1=day 2=month name 3=year 4=hour 5=minute 6=second
# (the UTC offset is not captured).
re_pat_time = r'^(\d+)/(\S+)/(\d+):(\d+):(\d+):(\d+)'
re_time = re.compile(re_pat_time)

# Resources excluded from aggregation: static assets, with or without a
# query string.  The extension must end the path or be followed by "?" so
# that paths such as "/page.jsp" or "/api.json" are not excluded by accident.
re_pat_ext = r'.+\.(js|css|ico|gif|jpg|png)(?:\?.*)?$'
re_ext = re.compile(re_pat_ext)

# Month abbreviation -> month number
month_def = {"Jan":1,"Feb":2,"Mar":3,"Apr": 4,"May": 5,"Jun":6,
             "Jul":7,"Aug":8,"Sep":9,"Oct":10,"Nov":11,"Dec":12}


def main():
    """Count page hits per month from gzip-compressed Apache access logs.

    Every command-line argument is treated as the path of a gzip-compressed
    access_log file.  Lines that cannot be parsed, requests answered with a
    4xx/5xx status, and resources rejected by is_aggregate_target() are
    skipped.  The per-month/per-resource counts are printed first, followed
    by the per-month totals.
    """
    # Slice instead of pop(0) so that sys.argv itself is left untouched;
    # element 0 is the script path.
    access_log_gzs = sys.argv[1:]

    access_summary = {}    # 'YYYY-MM' -> hit count
    access_summary_2 = {}  # 'YYYY-MM' -> {resource -> hit count}

    for access_log_gz in access_log_gzs:
        # 'rt' reads the gzip stream as text; the with-block guarantees the
        # file is closed, and iterating the handle streams line by line
        # instead of loading the whole log into memory.
        with gzip.open(access_log_gz, 'rt') as f_in:
            for log_line in f_in:
                log_cols = parse_apache_log_line(log_line)
                if log_cols is None:
                    continue

                # 4xx and 5xx responses (e.g. 404, 500) are not counted
                if log_cols['status'][0:1] in ('4', '5'):
                    continue

                # css, js, images and similar resources are not counted
                resource = log_cols['resource']
                if not is_aggregate_target(resource):
                    continue

                # Aggregate by month; use '%Y-%m-%d' for a per-day summary.
                dt_str = log_cols['time'].strftime('%Y-%m')
                access_summary[dt_str] = access_summary.get(dt_str, 0) + 1

                per_resource = access_summary_2.setdefault(dt_str, {})
                per_resource[resource] = per_resource.get(resource, 0) + 1

    # Print the aggregated results
    for date_str, per_resource in access_summary_2.items():
        for resource, count in per_resource.items():
            print(date_str, resource, count)

    for date_str, count in access_summary.items():
        print(date_str, count)

            
def is_aggregate_target(resource):
    """Tell whether a requested resource should be counted in the summary.

    Returns False for the pre-login top pages, for anything matched by
    re_ext, and for requests whose resource starts with 'http://';
    returns True otherwise.
    """
    # Top pages shown before login are not counted
    pre_login_pages = ('/',
                       '/index.html',
                       '/owner/index.html',
                       '/owner/login.html')
    if resource in pre_login_pages or resource.startswith('/?'):
        return False

    # Requests such as cloud metadata lookups (169.254.169.254 on aws etc.)
    # are ignored
    if resource.startswith('http://'):
        return False

    # Anything matched by the excluded-extension pattern is not counted
    return re_ext.match(resource) is None


def parse_apache_log_line(log_line):
    """Parse one access_log line into a dict of its fields.

    Args:
        log_line: a single line of text from the access log.

    Returns:
        A dict with the keys 'host', 'ident', 'user', 'time', 'method',
        'resource', 'proto', 'status', 'bytes', 'referer' and 'agent'.
        'time' is a naive datetime.datetime (the UTC offset in the log is
        not applied); every other value is a string.  Returns None when the
        line or its timestamp cannot be parsed, including an unknown month
        abbreviation or an out-of-range date/time component.
    """
    line_match = re_log_line.match(log_line)
    if line_match is None:
        return None

    # The field names follow the order of the capture groups in re_log_line.
    field_names = ('host', 'ident', 'user', 'time', 'method', 'resource',
                   'proto', 'status', 'bytes', 'referer', 'agent')
    log_cols = dict(zip(field_names, line_match.groups()))

    time_match = re_time.match(log_cols["time"])
    if time_match is None:
        return None

    day, month_name, year, hour, minute, second = time_match.groups()

    # An unrecognised month name means the line is malformed; report it as
    # unparsable instead of raising KeyError and aborting the whole run.
    month = month_def.get(month_name)
    if month is None:
        return None

    try:
        log_cols["time"] = datetime.datetime(int(year), month, int(day),
                                             int(hour), int(minute),
                                             int(second))
    except ValueError:
        # Out-of-range component (e.g. day 32, hour 25): treat as unparsable.
        return None
    return log_cols

    
# Run the aggregation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

end0tknr's kipple - web写経開発

太宰府天満宮の狛犬って、妙にカワイイ