这是我比较完整的解决方案。
Apache 日志文件不应包含无效字符或反斜杠（PostgreSQL 的 COPY 文本格式会把反斜杠当作转义字符）。如有必要，您可以使用以下命令清理日志文件（注意：含有反斜杠的整行会被丢弃）：
# keep only runs of printable characters (strings), then drop every line that contains a backslash
strings < logfile | grep -v '\\' > cleanedlogfile
然后将日志文件导入 PostgreSQL 并进行解析（m[1] 到 m[7] 对应 regexp_matches 函数中正则表达式的各个捕获组）：
-- postgresql script, step 1: stage the cleaned log file, one log line per row
drop table if exists rawlog;
create table rawlog (
    record varchar
);
-- bulk-load the file with copy's default text format; the path is read by the
-- database server itself, and in this format a backslash is an escape
-- character and a tab is a column delimiter, so the file must contain neither
copy rawlog (record)
from '/path/to/your/apache/cleaned/log/file';
-- parse the raw log lines in rawlog into typed columns in table accesslog.
-- m[1]..m[7] are the capture groups of the regular expression below; for
-- apache's common log format (%h %l %u %t "%r" %>s %b) they hold:
--   m[1] client ip, m[2] identity, m[3] user, m[4] timestamp,
--   m[5] request line, m[6] status code, m[7] response size
drop table if exists accesslog;
create table accesslog as
with matched as (
    -- regexp_matches yields no row for a line that does not match, so such
    -- lines are skipped instead of raising an error.
    -- the size group also accepts '-', which apache's %b writes when no
    -- response body was sent; requiring digits would silently drop those lines.
    select
        regexp_matches(
            record,
            E'(.*) (.*) (.*) \\[(.*)\\] "(.*)" (\\d+) (\\d+|-)'
        ) as m
    from rawlog
),
fields as (
    -- name the raw text pieces once so the final select stays readable
    select
        m[1] as clientip,
        m[4] as logtime,                        -- e.g. 10/Oct/2000:13:55:36 -0700
        split_part(m[5], ' ', 1) as method,
        split_part(m[5], ' ', 2) as target,     -- request target: path plus optional ?query
        m[6] as status,
        m[7] as bytes
    from matched
)
select
    clientip,
    -- rebuild 'YYYY-MM-DD HH24:MI:SS +ZZZZ' and let the timestamptz cast apply
    -- the logged offset. only the date part goes through to_date (cast to a
    -- zone-less timestamp), so the session time zone never touches the logged
    -- wall-clock time; going through to_timestamp would shift times that fall
    -- into the session zone's daylight-saving gap.
    (
        to_char(to_date(logtime, 'DD/Mon/YYYY')::timestamp, 'YYYY-MM-DD')
        || ' '
        || substr(logtime, strpos(logtime, ':') + 1)
    )::timestamp with time zone as "time",
    method,
    split_part(target, '?', 1) as uri,
    -- everything after the first '?', kept intact even if it contains further
    -- '?' characters; empty string when the target has no query part
    case
        when strpos(target, '?') > 0 then substr(target, strpos(target, '?') + 1)
        else ''
    end as query,
    status::smallint as status,
    -- '-' means no bytes were sent, so store it as 0
    coalesce(nullif(bytes, '-'), '0')::bigint as bytes
from fields;
-- optional: b-tree indexes on the client ip, timestamp and uri columns of accesslog
create index accesslogclientipidx on accesslog using btree (clientip);
create index accesslogtimeidx on accesslog using btree ("time");
create index accessloguriidx on accesslog using btree (uri);