- NCSA HTTPd (Apache HTTP Server 前身) 定义的一个标准 Web 服务器日志格式。
- 格式:
host ident authuser date request status bytes
例如:127.0.0.1 user-identifier frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326
- 如果某个字段没有值,就用 `-` 代替。
import re
from datetime import datetime
# Common Log Format: host ident authuser [date] "request" status bytes.
# The pattern has no end anchor and is applied with ``match`` below, so any
# text following the bytes field is ignored rather than rejected.
RE_CLF = re.compile(r'(\S+) (\S+) (\S+) \[(.*?)\] "(.*?)" (\d{3}) (\d+|-)')


def parse_clf(log_line: str) -> dict:
    """Parse one Common Log Format line into a dict of its seven fields.

    Args:
        log_line: A single log line, e.g.
            ``127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET / HTTP/1.0" 200 2326``.

    Returns:
        A dict with the keys ``host``, ``ident``, ``authuser``, ``date``,
        ``request``, ``status`` and ``bytes``. ``date`` is a timezone-aware
        ``datetime``, ``status`` is an ``int``, and ``bytes`` is an ``int``
        or ``None`` when the field is ``-``. ``host``, ``ident``,
        ``authuser`` and ``request`` are returned exactly as matched, so a
        ``-`` placeholder in those fields stays the string ``'-'``.

    Raises:
        ValueError: If the line does not start with a CLF record, or if the
            bracketed timestamp does not fit ``%d/%b/%Y:%H:%M:%S %z``.
    """
    match = RE_CLF.match(log_line)
    if match is None:
        raise ValueError('Log line does not match CLF format')

    host, ident, authuser, time_str, request, status, size = match.groups()

    # NOTE: ``%b`` is matched against the current locale's abbreviated month
    # names, so parsing "Oct" relies on an English/C LC_TIME setting.
    time_format = '%d/%b/%Y:%H:%M:%S %z'

    return {
        'host': host,
        'ident': ident,
        'authuser': authuser,
        'date': datetime.strptime(time_str, time_format),
        'request': request,
        'status': int(status),
        # "-" means the size is absent; report that as None.
        'bytes': None if size == '-' else int(size),
    }
# Demo: parse a sample CLF line and print the resulting dict.
# The request field must be wrapped in double quotes: RE_CLF matches "..."
# literally, and single quotes here would also end the Python string early.
log_example = '127.0.0.1 user-identifier frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326'
parsed_log = parse_clf(log_example)
print(parsed_log)
# Expected output:
# {'host': '127.0.0.1', 'ident': 'user-identifier', 'authuser': 'frank',
# 'date': datetime.datetime(2000, 10, 10, 13, 55, 36, tzinfo=datetime.timezone(datetime.timedelta(days=-1, seconds=61200))),
# 'request': 'GET /apache_pb.gif HTTP/1.0', 'status': 200, 'bytes': 2326}