#!/usr/bin/env perl
#
# accesslog-aggregate
#
# takes an access_log, prepends/summarizes it with some statistics
#
# $Id$

use warnings;
use strict;

use Getopt::Std;
use Time::Piece;    # for strptime, also used for strftime

#use URI::Escape;  # but conditionally:
# URI::Escape is only needed for -u, so load it at run time and remember
# whether that worked instead of failing at startup when it is missing.
my $have_uri_escape = eval {
  require URI::Escape;
  URI::Escape->import();
  1;
};

# command line options, keyed by option letter
my %opt;
getopts( 'ad123f:FhHm:oOpPrst:uvw?', \%opt );

# -m, -p and -r all select the path-matching regex, so only one can apply
if ( scalar( grep { defined $opt{$_} } qw(m p r) ) > 1 )
{
  die
   "options -m, -p and -r are exclusive, please specify at most one of them\n";
}

# Apache's Common Log Format.  The quotes are plain characters here:
# in this (single-quoted) string a backslash would be taken literally
# and the generated regex would then demand a backslash in every log line.
my $default_format = '%h %l %u %t "%r" %>s %b';

# precedence for the input format: -f, then the environment, then the default
if ( !defined( $opt{f} ) )
{
  $opt{f} =
   defined( $ENV{ACCESSLOG_AGGREGATE_FORMAT} )
   ? $ENV{ACCESSLOG_AGGREGATE_FORMAT}
   : $default_format;
}

HELP_MESSAGE() if $opt{h} || $opt{'?'};

sub HELP_MESSAGE
# Print the usage message on standard error and exit with status 0.
# Called for -h and -? (and, by its name, usable by Getopt::Std for --help);
# any arguments are ignored.
# The here-document interpolates, so $0 and $default_format are expanded
# and any other literal $ must be written as \$.
{
  print STDERR <<ZZ;

Usage: $0 [-a] [-d|-1|-2|-3|-H] [-o|-O] [-f FORMAT] [-F] [-p|-r|-m regex] [-s] [-t FORMAT] [-u] [-v] [-w] [access_log1] [...]

to summarize an Apache access log
supplied as filename arguments or on standard input.

Output:

   with -a, each log entry, prefixed with 7 values, separated by spaces
   without -a, just those 7 space-separated values, summarizing entries

 The column values stand for, respectively:

  + number of requests (in this aggregation)
  + total number of bytes returned (in this aggregation)
  + HTTP status (as aggregated by -s)
  + time request was made (as aggregated by -t)
  + number of seconds the request was late (as aggregated by -o or -O)
  + hostname / IP address / domain (as aggregated by -d, -1, -2 or -3)
  + path or (with -m/-p/-r) matched path substring

 and * is used to denote 'all values'.

Options:

  -a  Print all entries, prefixed with the 7 values described above.

      Without -a, print totals over all entries,
      grouped by the -d, -p, -r options if specified.

  -d  Aggregate per 'domain', the hostname or IP address with
      the most specific component stripped.
  -1  Aggregate on the least specific component of the hostname or IP address.
  -2  Aggregate on the 2 least specific components of the hostname or IP address.
  -3  Aggregate on the 3 least specific components of the hostname or IP address.
  -H  Aggregate per full hostname or IP address, as logged.

  -f FORMAT

      The input log format as specified by Apache's LogFormat.
      The default is Apache's default:

         $default_format

      Not all formats are supported (yet).
      Please note that " and \\ are not special in this string;
      for the format denoted by LogFormat "%a\\"%b", supply %a"%b.

  -F  Accept - as a field value in the input log,
      in addition to what the format specification allows.
      (Apache appears to use this for some fields.)

  -o  Aggregate by number of seconds the request was late.
      This is the number of seconds the timestamp of the request
      (which records the time the request was received)
      is earlier than the latest previous timestamp seen.
      This number being higher than 0 is an indication of
      performance problems.

  -O  Aggregate by whether the request was late or not.
      Like -o, except all non-0 values are lumped together as '+'.

  -m  regex

      Match the regex against paths and aggregate by the matched values,
      using - if the path is empty, ? if the match fails,
      and ?? if no path can be found on the log line at all.
      If the regex contains capturing brackets, uses \$1 as the value;
      uses - if the path is empty.

  -p  Aggregate per 'prefix', the first component of the URL path.
      Shorthand for -m '^/*([^/?]*)'

  -r  Aggregate per 'repository', the second component of the URL path
      (which on svn.win.tue.nl is a repository name),
      Shorthand for -m '^/*[^/?]*/*([^/?]*)'

  -s  Aggregate by HTTP status.

  -t FORMAT

      Aggregate by time, according to the given format, which is that of
      the POSIX strftime() function.  For instance,

        %Y stands for year
        %m stands for month number
        %b stands for abbreviated month name
        %d stands for day of the month
        %j stands for day of the year
        %a stands for abbreviated weekday name
        %H stands for hour of the day
        %M stands for minute in the hour
        %S stands for second in the minute

  -u  URI-unescape (decode) the values of -p and -r.
      E.g. this groups %7fuser and ~user together.
      (Note: lines with whitespaces in these values are misparsed.)

  -v  Verbose: print diagnostics (such as the generated parsing regex
      and the parsed fields of each line) on standard error.

  -w  When a line doesn't match the expected input format,
      only warn; continue reading input.
      Without -w, report on the input so far and exit.

Environment variable:

  ACCESSLOG_AGGREGATE_FORMAT

    The value used if the -f option is not specified.
    If neither are specified, $default_format is used.

ZZ
  exit(0);
}

sub ehm
# Diagnostic output: with -v, warn with the arguments joined by single
# spaces and terminated by a newline; without -v, stay silent.
{
  if ( $opt{v} )
  {
    my $message = join( ' ', @_ );
    warn $message, "\n";
  }
}

# Splits the request line (the %r value) into method, path and protocol
# version; the path is whatever sits between the first and last
# whitespace-separated words.
my $request_rx = qr#^([A-Z]+)\s+(.*?)\s+/*([A-Z]+/\S+)$#;

# The regex that is applied to the request path to obtain the value to
# aggregate on: the user's own (-m), the first path component (-p),
# the second path component (-r), or simply the whole path.
# -m is tested for being given rather than for being true, so that a
# regex such as '0' is honoured instead of silently ignored.
my $match_rx =
   defined( $opt{m} ) && length( $opt{m} ) ? qr($opt{m})
 : $opt{p} ? qr#^/*([^/?]*)#
 : $opt{r} ? qr#^/*[^/?]*/*([^/?]*)#
 :           qr#.*#;

# The running totals, as nested hashes keyed successively by formatted time,
# status, domain, matched path value and lateness.
my %ftime2status2domain2match2late2nr_requests;
my %ftime2status2domain2match2late2nr_bytes;

sub printf_totals
# Print one output line: the request count and byte count accumulated so far
# for the given aggregation key, followed by the key values themselves and
# $line, all separated by single spaces.
{
  my ( $ftime, $status, $domain, $match, $secslate, $line ) = @_;

  my $nr_requests =
   $ftime2status2domain2match2late2nr_requests{$ftime}{$status}{$domain}
   {$match}{$secslate};
  my $nr_bytes =
   $ftime2status2domain2match2late2nr_bytes{$ftime}{$status}{$domain}
   {$match}{$secslate};

  my @columns = (
    $nr_requests, $nr_bytes, $status, $ftime,
    $secslate,    $domain,   $match,  $line
  );

  print join( ' ', @columns ), "\n";
}

sub strip_final_components
# Reduce the dot-separated name $h to its leading components.
#
# if $d < 0, strip off that many (final) components
# if $d > 0, leave at most that many (leading) components
#
# Returns the remaining components joined by dots, followed by a dot
# if anything was stripped; returns '*' if no component remains.
# E.g. (1, '192.168.1.5') gives '192.' and (-1, '192.168.1.5')
# gives '192.168.1.'
{
  my ( $d, $h ) = @_;
  my @comps = split( /\./, $h );

  # splice's LENGTH has exactly the semantics we need when given $d itself:
  # positive: remove (and return) that many leading elements,
  # negative: remove (and return) all but that many trailing elements;
  # either way, what is left behind in @comps is what got stripped
  my @remains     = splice( @comps, 0, $d );
  my $nr_stripped = scalar(@comps);

  # the empty final element produces the trailing dot
  return join( '.', @remains, !@remains ? ('*') : $nr_stripped ? ('') : () );
}

sub strip_components
# Like strip_final_components, but aware of where the most specific
# components are: a value consisting of digits and dots only is handled
# as is, anything else is reversed character by character before and
# after the stripping, so that its leading components are the ones dropped.
{
  my ( $d, $h ) = @_;

  # only digits and dots: pass through unchanged
  return strip_final_components( $d, $h ) if $h !~ /[^\d.]/;

  my $backwards = reverse($h);    # scalar context: reverses the characters
  my $stripped  = strip_final_components( $d, $backwards );

  return scalar reverse($stripped);
}

sub log_line_parser
# see http://httpd.apache.org/docs/current/mod/mod_log_config.html#formats
#
# Compiles the log format in $opt{f} into a regex and returns a closure
# that parses one log line with it.  The closure takes the line as its
# only argument and returns a reference to a hash mapping format
# specifiers (including the %) to the values found on the line;
# it returns nothing if the line doesn't match, after warning if -w is given.
# Dies if -t, -o or -O is given while the format lacks %t.
#
# a nice and small CPAN module, Apache::LogRegex, exists to do this,
# but it's nonstandard and I don't want to have to set up local::lib,
# so here is an alternative implementation
{
  # to parse the LogFormat specification string in $opt{f},
  # slightly leniently in case of future extensions (beyond Apache 2.4):
  # the terms all start with %, then either end in %
  # or optionally have a {}-enclosed part
  # that (I assume) cannot contain }, followed by
  # optionally ^ or >, followed by one or more alphabet characters
  my @format =
   split( m#(%(?:%|(?:\{[^}]*\})?[^a-z\s\\]?[a-z]+))#i, $opt{f} );
  # the % terms are the odd elements; if any are given, the 0th term is empty
  #warn 'tokenized format: [', join(']  [', @format), "]\n";

  # now generate a parser that scans the line using the format specifiers
  # and puts the values into a hash; the keys of the hash are the
  # format specifier including the %, and in case of duplicate specifications
  # only the *first* value is returned; e.g. if the format is '%s %>s %s',
  # the hash will have keys '%s' with the first value and '%>s'
  # like Apache::LogRegex, we do *not* do a validity check on the values,
  # we only do a simple scan guaranteed to capture the value if valid;
  # we can't guarantee this in the general case, as some specifiers
  # such as %{VARNAME}e, allow *arbitrary* values, rendering the format
  # fundamentally unparsable: we can't tell where in '%{A}e%B}e'
  # the first value ends and the second one starts; in practice,
  # constant string parts (the nonempty even values in @format)
  # are used to delimit such values so we'll assume they don't contain
  # the closing delimiter; so use a regular expression that scans
  # nongreedily, and use more specific subexpressions for nonarbitrary
  # value types; we need to use heuristics here, so the following may change
  #
  #ehm( 'log format specification elements:', map { "'$_'" } @format );
  my @rx = map {
    $_ eq ''
     ? ''    # if empty, don't include anything for it in the regex
     : !/^%/ ? "\Q$_\E"    # if nonempty constant string, match it literally
     : $_ eq '%%' ? '%'            # match % literally
     : $_ eq '%t' ? '(\[.+?\])'    # []-delimited timestamp
     : $_ eq '%r' ? '(.*?)'        # first line of request: anything
     : $_ eq '%u'
     ? '(\S+?|\S.*\S?)'            # authenticated user - can be Lucid Chairman
     : /T$/      ? '(\S+?)'        # time taken without {} is non-whitespace
     : /}[pP]$/  ? '(\d+?)'        # port number and process ID are integers
     : /^%\{.*}/ ? '(.*?)'         # all other %{...}...: anything (not really)
     :   '(\S+?)';    # everything else (including %U and %q): non-whitespace
  } @format;
  #ehm( 'log format parsing regex elements:', @rx );

  if ( $opt{F} )
  {
    # in addition to the above, also accept a hyphen in every (named) field;
    # only touch the capturing patterns generated for % terms, so that
    # a constant string that happens to end in ) is left alone
    foreach my $i ( grep { $_ % 2 } 0 .. $#format )
    {
      $rx[$i] =~ s/\)$/|-)/ if $rx[$i] =~ /^\(/;
    }
  }

  my $rx = join( '', '^', @rx, '$' );
  ehm( 'log format parsing regex:', $rx );
  $rx = qr($rx);
  #ehm( 'log format parsing regex:', $rx );

  # the odd values of @format, except %%: it matches a literal % without
  # capturing anything, so it must not be counted as a field
  my @keys =
   grep { $_ ne '%%' } map { $format[$_] } grep { $_ % 2 } 0 .. $#format;

  # the parentheses around the first grep are essential: without them
  # everything up to the closing parenthesis would become its argument list
  if ( ( grep { defined( $opt{$_} ) } qw(o O t) )
    && !grep { $_ eq '%t' } @keys )
  {
    die
     "the log format lacks %t, so the options -t, -o and -O cannot be used\n";
  }

  return sub {
    my @values = $_[0] =~ /$rx/;

    # on a successful match there is exactly one captured value per key
    if ( scalar(@values) != scalar(@keys) )
    {
      if ( $opt{w} )
      {
        warn "input line does not match format $opt{f} at $ARGV:$.: $_[0]";
      }
      return;
    }

    my %res = map { $keys[$_] => $values[$_] } reverse( 0 .. $#values );
    # later pairs overwrite earlier ones with the same key, so going through
    # the values in reverse makes the *first* one win in case of duplicates
    \%res;
  };
}

# the line parser (a code reference) for the log format in effect
my $parse_log_line = log_line_parser();

# the latest request timestamp seen so far, in epoch seconds;
# stays undefined until the first timestamp is parsed under -o or -O
my $secslatest_time;

sub or_else
# Return the first argument if it is defined, the second one otherwise.
# (A stand-in for the // operator, which isn't defined on all our hosts.)
{
  my ( $value, $fallback ) = @_;

  return $fallback if !defined($value);
  return $value;
}

sub add
# Add the second argument to the first one *in place* (the first argument
# is modified through @_), treating an undefined first argument as absent:
# it is then simply set to the second argument.  Returns the new value.
# (A stand-in for //=, as // isn't defined on all our hosts.)
{
  return $_[0] = $_[1] if !defined( $_[0] );
  return $_[0] = $_[0] + $_[1];
}

sub formatted_time
# Format the given time for use as an aggregation key.
#
# Returns '*' if no -t format is in effect or no time is given,
# the time formatted in GMT according to the -t format otherwise,
# or '?' if formatting yields nothing.
{
  my ($time) = @_;

  return '*' if !$opt{t} || !defined($time);    # should be equivalent

  my $ftime = gmtime($time)->strftime( $opt{t} );

  # test for emptiness rather than truth: '0' is a perfectly valid
  # result, e.g. that of the format %w on a Sunday
  return defined($ftime) && length($ftime) ? $ftime : '?';
}

# -u depends on the optional URI::Escape module; this doesn't depend on the
# input, so check it once, up front, rather than for every line read
if ( $opt{u} && !$have_uri_escape )
{
  die
"fatal error: cannot use -u, the Perl module URI::Escape is not installed\n";
}

# Read the log line by line, determine the aggregation key of each entry
# (formatted time, status, domain, matched path value, lateness)
# and add the entry to the totals for that key.
while (<>)
{
  my $entry = $parse_log_line->($_);

  if ( !defined($entry) )
  {
    # the line doesn't match the format: with -w, skip it,
    # otherwise stop reading and report on the input so far
    next if $opt{w};
    last;
  }

  #ehm('parsed line has', scalar(keys %$entry), 'values');
  ehm(
    join( ', ',
      map { "$_ => " . ( defined( $entry->{$_} ) ? $entry->{$_} : '-' ) }
       keys %$entry )
  );

  my $status = $opt{s} ? or_else( $entry->{'%>s'}, '*' ) : '*';

  my $time;
  my $secslate = '*';

  if ( $opt{t} || $opt{o} || $opt{O} )
  {
    if (
      eval {
        $time =
         Time::Piece->strptime( $entry->{'%t'}, '[%d/%b/%Y:%H:%M:%S %z]' );
      }
     )
    {

      if ( $opt{o} || $opt{O} )
      {
        my $epoch = $time->epoch;

        if ( !defined($secslatest_time)
          || $epoch >= 0 && $epoch >= $secslatest_time )
        {
          # not earlier than anything seen before: on time
          $secslatest_time = $epoch;
          $secslate        = 0;
        }
        else
        {
          # -o reports by how many seconds, -O just the fact
          $secslate = $opt{o} ? ( $secslatest_time - $epoch ) : '+';
        }
      }
    }
    else
    {
      warn "ignoring unparseable input timestamp $entry->{'%t'} in: $_";
      undef $time;
    }
  }

  my ($domain) =
     $opt{H} ? $entry->{'%h'}
   : $opt{3} ? strip_components( 3,  $entry->{'%h'} )
   : $opt{2} ? strip_components( 2,  $entry->{'%h'} )
   : $opt{1} ? strip_components( 1,  $entry->{'%h'} )
   : $opt{d} ? strip_components( -1, $entry->{'%h'} )
   :           ();
  $domain = '*' if !defined($domain);

  my $match;

  if ( defined( $entry->{'%r'} ) )
  {
    my ( $method, $path, $version ) = $entry->{'%r'} =~ /$request_rx/;

    if ( !defined($path) )
    {
      warn "unparseable request $entry->{'%r'} in: $_";
      $match = '?';
    }
    elsif ( $path eq '' )
    {
      $match = '-';
    }
    elsif ( $path =~ /$match_rx/ )
    {
      # the first capture if there is one, the whole match otherwise;
      # written with or_else as // isn't defined on all our hosts
      my ( $captured, $whole ) = ( $1, $& );
      $match = or_else( $captured, or_else( $whole, '?' ) );
    }
    else
    {
      $match = '?';
    }
  }
  else
  {
    warn "cannot find request in log line\n";
    $match = '??';
  }

  if ( $opt{u} )
  {
    $match = uri_unescape($match);
  }

  my $ftime = formatted_time($time);

  ++$ftime2status2domain2match2late2nr_requests{$ftime}->{$status}->{$domain}
   ->{$match}->{$secslate};
  my $nr_bytes = or_else( $entry->{'%b'}, 0 );
  $nr_bytes = 0 if $nr_bytes eq '-';    # %b logs - when no bytes were sent
  add(
    $ftime2status2domain2match2late2nr_bytes{$ftime}->{$status}->{$domain}
     ->{$match}->{$secslate},
    $nr_bytes
  );

  if ( $opt{a} )
  {
    # with -a, print every entry, prefixed by the running totals for its key
    chomp;
    printf_totals( $ftime, $status, $domain, $match, $secslate, $_ );
  }
}

# Without -a, nothing has been printed yet: print the totals for every
# aggregation key seen, with the keys sorted (as strings) at every level.
if ( !$opt{a} )
{
  my $by_ftime = \%ftime2status2domain2match2late2nr_requests;

  for my $ftime ( sort keys %{$by_ftime} )
  {
    my $by_status = $by_ftime->{$ftime};

    for my $status ( sort keys %{$by_status} )
    {
      my $by_domain = $by_status->{$status};

      for my $domain ( sort keys %{$by_domain} )
      {
        my $by_match = $by_domain->{$domain};

        for my $match ( sort keys %{$by_match} )
        {
          my $by_late = $by_match->{$match};

          for my $secslate ( sort keys %{$by_late} )
          {
            printf_totals( $ftime, $status, $domain, $match, $secslate, '*' );
          }
        }
      }
    }
  }
}