#!/usr/bin/env perl # # accesslog-aggregate # # takes an access_log, prepends/summarizes it with some statistics # # $Id$ use warnings; use strict; use Getopt::Std; use Time::Piece; # for strptime, also used for strftime #use URI::Escape; # but conditionally: my $have_uri_escape = eval { require URI::Escape; URI::Escape->import(); 1; }; my %opt; getopts( 'ad123f:FhHm:oOpPrst:uvw?', \%opt ); if ( scalar( grep { defined $opt{$_} } qw(m p r) ) > 1 ) { die "options -m, -p and -r are exclusive, please specify at most one of them\n"; } my $default_format = '%h %l %u %t \"%r\" %>s %b'; if ( !defined( $opt{f} ) ) { $opt{f} = defined( $ENV{ACCESSLOG_AGGREGATE_FORMAT} ) ? $ENV{ACCESSLOG_AGGREGATE_FORMAT} : $default_format; } HELP_MESSAGE() if $opt{h} || $opt{'?'}; sub HELP_MESSAGE { print STDERR <{$status}->{$domain} ->{$match}->{$secslate}, $ftime2status2domain2match2late2nr_bytes{$ftime}->{$status}->{$domain} ->{$match}->{$secslate}, $status, $ftime, $secslate, $domain, $match, $line ), "\n"; } sub strip_final_components # if $d < 0, strip off that many components # if $d > 0, leave at most that many components { my ( $d, $h ) = @_; my @comps = split( /\./, $h ); my @remains = splice( @comps, 0, -$d ); my $nr_stripped = scalar(@comps); #join('.', @remains, split(//, '*' x $nr_stripped)); # works, but is confusing join( '.', @remains, !@remains ? ('*') : $nr_stripped ? ('') : () ); } sub strip_components { my ( $d, $h ) = @_; $h !~ /[^\d.]/ ? strip_final_components( $d, $h ) : scalar reverse( strip_final_components( $d, scalar reverse($h) ) ); } sub log_line_parser # see http://httpd.apache.org/docs/current/mod/mod_log_config.html#formats # # a nice and small CPAN module, Apache::LogRegex, exists to do this, # but it's nonstandard and I don't want to have to set up local::lib, # so here is an alternative implementation { # to parse the LogFormat specification string in $opt{f}, # slightly leniently in case of future extensions (beyond Apache 2.4): # the terms all start with %, then either end in % # or optionally have a {}-enclosed part # that (I assume) cannot contain }, followed by # optionally ^ or >, followed by one or more alphabet characters my @format = split( m#(%(?:%|(?:{[^}]*})?[^a-z\s\\]?[a-z]+))#i, $opt{f} ); # the % terms are the odd elements; if any are given, the 0th term is empty #warn 'tokenized format: [', join('] [', @format), "]\n"; # now generate a parser that scans the line using the format specifiers # and puts the values into a hash; the keys of the hash are the # format specifier including the %, and in case of duplicate specifications # only the *first* value is returned; e.g. if the format is '%s %>s %s', # the hash will have keys '%s' with the first value and '%>s' # like Apache::LogRegex, we do *not* do a validity check on the values, # we only do a simple scan guaranteed to capture the value if valid; # we can't guarantee this in the general case, as some specifiers # such as %{VARNAME}e, allow *arbitrary* values, rendering the format # fundamentally unparsable: we can't tell where in '%{A}e%B}e' # the first value ends and the second one starts; in practice, # constant string parts (the nonempty even values in @format) # are used to delimit such values so we'll assume they don't contain # the closing delimiter; so use a regular expression that scans # nongreedily, and use more specific subexpressions for nonarbitrary # value types; we need to use heuristics here, so the following may change # #ehm( 'log format specification elements:', map { "'$_'" } @format ); my @rx = map { $_ eq '' ? '' # if empty, don't include anything for it in the regex : !/^%/ ? "\Q$_\E" # if nonempty constant string, match it literally : $_ eq '%%' ? '%' # match % literally : $_ eq '%t' ? '(\[.+?\])' # []-delimited timestamp : $_ eq '%r' ? '(.*?)' # first line of request: anything : $_ eq '%u' ? '(\S+?|\S.*\S?)' # authenticated user - can be Lucid Chairman : /T$/ ? '(\S+?)' # time taken without {} is non-whitespace : /}[pP]$/ ? '(\d+?)' # port number and process ID are integers : /^%\{.*}/ ? '(.*?)' # all other %{...}...: anything (not really) : '(\S+?)'; # everything else (including %U and %q): non-whitespace } @format; #ehm( 'log format parsing regex elements:', @rx ); if ( $opt{F} ) { # in addition to the above, also accept a hyphen in every (named) field @rx = map { /^(.*)\)$/ ? $1 . '|-)' : $_ } @rx; } my $rx = join( '', '^', @rx, '$' ); ehm( 'log format parsing regex:', $rx ); $rx = qr($rx); #ehm( 'log format parsing regex:', $rx ); my @keys = map { $format[$_] } grep { $_ % 2 } 0 .. $#format; # the odd values of @format if ( grep { defined( $opt{$_} ) } qw(oOt) && !grep { $_ eq '%t' } @keys ) { die "the log format lacks %t, so the options -t, -o and -O cannot be used\n"; } sub { my @values = $_[0] =~ /$rx/; if ( scalar(@values) != scalar(@keys) ) { if ( $opt{w} ) { warn "input line does not match format $opt{f} at $ARGV:$.: $_[0]"; } return; } my %res = map { $keys[$_] => $values[$_] } reverse( 0 .. $#values ); # iterate over @values, which is empty in case parsing failed; # use reverse() to pick the last one in case of duplicates \%res; } } my $parse_log_line = log_line_parser(); my $secslatest_time; sub or_else { defined( $_[0] ) ? $_[0] : $_[1] # // isn't defined on all our hosts } sub add { $_[0] = defined( $_[0] ) ? $_[0] + $_[1] : $_[1] # // isn't defined on all our hosts } sub formatted_time { my ($time) = @_; if ( !$opt{t} || !defined($time) ) # should be equivalent { '*'; } elsif ( my $ftime = gmtime($time)->strftime( $opt{t} ) ) { $ftime; } else { '?'; } } while (<>) { my $l = &$parse_log_line($_); if ( !defined($l) ) { if ( $opt{w} ) { next; } else { last; } } #ehm('parsed line has', scalar(keys %$l), 'values'); ehm( join( ', ', map { "$_ => " . ( defined( $l->{$_} ) ? $l->{$_} : '-' ) } keys %$l ) ); if ( $opt{u} && !$have_uri_escape ) { die "fatal error: cannot use -u, the Perl module URI::Escape is not installed\n"; } my $status = $opt{s} ? or_else( $l->{'%>s'}, '*' ) : '*'; my $time; my $secslate = '*'; if ( $opt{t} || $opt{o} || $opt{O} ) { if ( eval { $time = Time::Piece->strptime( $l->{'%t'}, '[%d/%b/%Y:%H:%M:%S %z]' ); } ) { if ( $opt{o} || $opt{O} ) { my $epoch = $time->epoch; if ( !defined($secslatest_time) || $epoch >= 0 && $epoch >= $secslatest_time ) { $secslatest_time = $epoch; $secslate = 0; } else { $secslate = $opt{o} ? ( $secslatest_time - $epoch ) : '+'; } } } else { warn "ignoring unparseable input timestamp $l->{'%t'} in: $_"; undef $time; } } my ($domain) = $opt{H} ? $l->{'%h'} : $opt{3} ? strip_components( 3, $l->{'%h'} ) : $opt{2} ? strip_components( 2, $l->{'%h'} ) : $opt{1} ? strip_components( 1, $l->{'%h'} ) : $opt{d} ? strip_components( -1, $l->{'%h'} ) : (); $domain = '*' if !defined($domain); my $match; if ( defined( $l->{'%r'} ) ) { my ( $method, $path, $version ) = $l->{'%r'} =~ /$request_rx/; if ( !defined($path) ) { warn "unparseable request $l->{'%r'} in: $_"; $match = '?'; } elsif ( $path eq '' ) { $match = '-'; } elsif ( $path =~ /$match_rx/ ) { $match = $1 // $& // '?'; } else { $match = '?'; } } else { warn "cannot find request in log line\n"; $match = '??'; } if ( $opt{u} ) { $match = uri_unescape($match); } my $ftime = formatted_time($time); ++$ftime2status2domain2match2late2nr_requests{$ftime}->{$status}->{$domain} ->{$match}->{$secslate}; my $nr_bytes = or_else( $l->{'%b'}, 0 ); $nr_bytes = 0 if $nr_bytes eq '-'; add( $ftime2status2domain2match2late2nr_bytes{$ftime}->{$status}->{$domain} ->{$match}->{$secslate}, $nr_bytes ); if ( $opt{a} ) { chomp; printf_totals( $ftime, $status, $domain, $match, $secslate, $_ ); } } if ( !$opt{a} ) { foreach my $ftime ( sort keys %ftime2status2domain2match2late2nr_requests ) { foreach my $status ( sort keys %{ $ftime2status2domain2match2late2nr_requests{$ftime} } ) { foreach my $domain ( sort keys %{ $ftime2status2domain2match2late2nr_requests{$ftime}->{$status} } ) { foreach my $match ( sort keys %{ $ftime2status2domain2match2late2nr_requests{$ftime}->{$status} ->{$domain} } ) { foreach my $secslate ( sort keys %{ $ftime2status2domain2match2late2nr_requests{$ftime}->{$status} ->{$domain}->{$match} } ) { printf_totals( $ftime, $status, $domain, $match, $secslate, '*' ); } } } } } }