apache access_log にあるuser agentからブラウザを判定 - end0tknr's kipple - 新web写経開発
apacheのログ(access_log)解析は、Apache::ParseLog 等のcpan moduleより正規表現 - end0tknr's kipple - 新web写経開発
正規表現と *.gz ファイルの読取りの練習です
#!/usr/local/python3/bin/python3
# -*- coding: utf-8 -*-
"""Aggregate page-view counts per month (and per resource) from gzipped
apache access_log files given as command-line arguments.

Lines with 4xx/5xx status, static assets (css/js/images), pre-login top
pages and absolute-URL probes are excluded from the aggregation.
"""
import gzip
import re
import sys
import datetime

# Apache combined-log-format regex:
#   host ident user [time] "method resource proto" status bytes "referer" "agent"
# (raw strings so the \[ escape is taken literally, not as a string escape)
re_pat_log_line = \
    " ".join([r'^([^ ]*) ([^ ]*) ([^ ]*) \[([^]]*)\] "([^ ]*)(?: *([^ ]*)',
              r'*([^ ]*))?" ([^ ]*) ([^ ]*) "(.*?)" "(.*?)"'])
re_log_line = re.compile(re_pat_log_line)

# access_log timestamp regex, e.g. 12/Jun/2020:04:27:27 +0900
# (the timezone offset is intentionally ignored)
re_pat_time = r'^(\d+)/(\S+)/(\d+):(\d+):(\d+):(\d+)'
re_time = re.compile(re_pat_time)

# resources excluded from aggregation (static assets, optionally with ?query)
re_pat_ext = r'.+\.(js|css|ico|gif|jpg|png)\??.*$'
re_ext = re.compile(re_pat_ext)

# month abbreviation (as used in apache logs) -> month number
month_def = {"Jan": 1, "Feb": 2, "Mar": 3, "Apr": 4, "May": 5, "Jun": 6,
             "Jul": 7, "Aug": 8, "Sep": 9, "Oct": 10, "Nov": 11, "Dec": 12}


def main():
    # argv[0] is the script itself; the rest are *.gz access logs.
    # Slicing avoids mutating sys.argv (the original pop(0) did).
    access_log_gzs = sys.argv[1:]

    access_summary = {}    # "YYYY-MM" -> total hit count
    access_summary_2 = {}  # "YYYY-MM" -> {resource -> hit count}

    for access_log_gz in access_log_gzs:
        # open the gzip as text; "with" guarantees the handle is closed,
        # and iterating the file object streams line by line instead of
        # loading the whole log into memory via readlines()
        with gzip.open(access_log_gz, 'rt') as f_in:
            for log_line in f_in:
                log_cols = parse_apache_log_line(log_line)
                if log_cols is None:
                    continue

                # 404 / 500 errors are excluded from aggregation
                if log_cols['status'].startswith(('4', '5')):
                    continue

                # css / js / images etc. are excluded from aggregation
                if not is_aggregate_target(log_cols['resource']):
                    continue

                dt_str = log_cols['time'].strftime('%Y-%m')
                # dt_str = log_cols['time'].strftime('%Y-%m-%d')
                access_summary[dt_str] = access_summary.get(dt_str, 0) + 1

                resource = log_cols['resource']
                per_month = access_summary_2.setdefault(dt_str, {})
                per_month[resource] = per_month.get(resource, 0) + 1

    # print aggregation results
    for date_str in access_summary_2:
        for resource, count in access_summary_2[date_str].items():
            print(date_str, resource, count)

    for date_str, count in access_summary.items():
        print(date_str, count)


def is_aggregate_target(resource):
    """Return True if the requested resource should be counted.

    Excludes pre-login top pages, static assets matched by re_ext, and
    absolute-URL requests (e.g. probes for the AWS metadata endpoint
    169.254.169.254).
    """
    # pre-login top pages are excluded
    if (resource in ('/', '/index.html',
                     '/owner/index.html', '/owner/login.html')
            or resource.startswith('/?')):
        return False

    # static assets (css / js / images) are excluded
    if re_ext.match(resource):
        return False

    # absolute-URL requests (aws metadata probes etc.) are excluded
    if resource.startswith('http://'):
        return False

    return True


def parse_apache_log_line(log_line):
    """Parse one access_log line into a dict of named columns.

    The 'time' entry is converted to a naive datetime (timezone offset
    dropped).  Returns None when the line or its timestamp cannot be
    parsed.
    """
    match_result = re_log_line.match(log_line)
    if match_result is None:
        return None

    log_cols = {'host':     match_result.group(1),
                'ident':    match_result.group(2),
                'user':     match_result.group(3),
                'time':     match_result.group(4),
                'method':   match_result.group(5),
                'resource': match_result.group(6),
                'proto':    match_result.group(7),
                'status':   match_result.group(8),
                'bytes':    match_result.group(9),
                'referer':  match_result.group(10),
                'agent':    match_result.group(11)}

    time_match = re_time.match(log_cols["time"])
    if time_match is None:
        return None

    # unknown month abbreviation -> treat as unparsable instead of
    # raising KeyError (original behavior crashed here)
    month = month_def.get(time_match.group(2))
    if month is None:
        return None

    log_cols["time"] = datetime.datetime(int(time_match.group(3)),
                                         month,
                                         int(time_match.group(1)),
                                         int(time_match.group(4)),
                                         int(time_match.group(5)),
                                         int(time_match.group(6)))
    return log_cols


if __name__ == '__main__':
    main()