#!/usr/bin/env perl
#
# xref - list filenames with their occurrences in their contents as words
#
# $Id$

# framework copied from ./grep-mbox

#changes IN PROGRESS, UNTESTED:
# -t option becomes -s
# new -t option (separator)
# default separator becomes ,
# don't split input on / or \ or the -d argument
# we need to be smarter about directory paths!

use strict;
use warnings;

use Getopt::Std;
use File::Spec;

my %opt;

getopts( '01e:bdhilrs:t:v', \%opt );

my $me = do { my @vdf = File::Spec->splitpath($0); pop(@vdf); };

my $splitexpr = '[^\w.-/\\\\]+';    # don't split on directory separators

HELP_MESSAGE() if $opt{h};

my $strippaths = $opt{b};
$splitexpr = $opt{d} if defined $opt{d};
study $splitexpr;
my $headerexpr = $opt{s};
my $separator = $opt{t} // ',';
my $process =
 join( ';', grep { /./ } ( $opt{i} ? 'y/A-Z/a-z/' : '', $opt{e} || '' ) );
my $includeloops       = $opt{l};
my $addisolatesasnodes = $opt{0};
my $addallasnodes      = $opt{1};
my $verbose            = $opt{v};

die("No regular file given as argument\n")
 if ( !@ARGV );

# test the Perl fragments
foreach my $o ( 'e', 's' )
{
  defined $opt{$o}
   or next;
  local ($_) = '';
  eval( $opt{$o} );
  $@
   and die "incorrect -$o option value: $@";
}

# we need to maintain a binary relation; represent it as a hash-valued hash,
#   i.e. a mapping string -> string -> 1

my @words2match = @ARGV;

# if -h was specified, look inside the content for @words2match
if ( defined $headerexpr )
{
  my %w2m;

  local (@ARGV) = @ARGV;    # !!

  while (<>)
  {
    chomp;

    local ($_) = $_;

    if ( eval($headerexpr) )
    {
      $w2m{$_} += 1;
    }
  }

  @words2match = sort keys %w2m;
}

if ($strippaths)
{
  @words2match = map { pathstripped($_) } @words2match;
}

if ($process)
{
  @words2match = map { processed($_) } @words2match;
}

my %words2match = map { $_ => 1 } @words2match;

#ehm("file name arguments: " , join(', ', @ARGV));
ehm( "words to match: ", join( ', ', sort keys %words2match ) );

ehm( scalar(@ARGV) . ' files to process' );

printmatches(@ARGV);    # must be provided all at once for correct -0 handling

exit(0);

# auxiliary functions:

sub ehm
{
  warn join( ' ', @_ ), "\n"
   if $verbose;
}

sub HELP_MESSAGE
{
  print STDERR <<ZZ;
Usage: $me [-p] [-e expr] file [file2 ...]

  -b       causes the pathnames to be stripped (similar to -e 's#.*/##')
  -d       supplies the regexp to split on (default: $splitexpr)
  -e expr  processes the target names, using this Perl fragment,
           which is expected to modify \$_
  -t expr  instead of using the filenames as target names,
           uses this Perl fragment to find target names on the input lines;
           when the fragment succeeds (yields true), associates the name with
           the line and those that follow (until the next match)
  -i       causes the names to be lowercased (shorthand for -e 'y/A-Z/a-z/')
  -l       include loops (self-references)
  -0       print isolates (files without any reference at all), one per line
  -1       print each file found, one per line
  -v       prints some progress information

Note: -e cannot be specified multiple times - concatenate expressions with ;

Note: other than -b, which causes all pathnames to be ignored,
there is no pathname parsing: this utility ignores pathnames completely
when checking for references.  Files in different directories with the
same name are different reference sources, but a single reference target.

Note: with -h, the input file(s) are opened and read twice; therefore,
this option cannot be used wwith non-files (e.g. pipes, standard input).

ZZ
  exit(0);
}

sub pathstripped
{
  local ($_);

  my @vdf = File::Spec->splitpath( shift(@_) );

  pop(@vdf);
}

sub processed
{
  local ($_) = shift(@_);

  eval $process;
  die "$me: invalid -e expression: $@\n"
   if $@;

  $_;
}

sub treated
{
  my @words = @_;

  if ($strippaths)
  {
    @words = map { pathstripped($_) } @words;
  }

  if ($process)
  {
    @words = map { processed($_) } @words;
  }

  @words;
}

sub matched
# returns (\%seen, \@nonincluders), the latter is only filled if $addisolatesasnodes
{
  my %seen;    # the returned result; a mapping: target -> source -> number
  my %is_selfmatch;

  foreach my $f (@_)
  {
    if ( !open( IN, '<', $f ) )
    {
      warn "Cannot read from file $f: #@\n";
      next;
    }

    my $selfmatch;

    if ( !defined $headerexpr )
    {
      ($selfmatch) = treated($f);
      $is_selfmatch{$selfmatch} = 1
       if $addisolatesasnodes || $addallasnodes;
    }

    while (<IN>)
    {
      chomp;

      if ( defined($headerexpr) )
      {
        # see if a new 'file' starts on this line and determine its name
        local ($_) = $_;

        if ( eval $headerexpr )
        {
          ($selfmatch) = treated($_);
          ehm("begin at $f:$.: $selfmatch");
        }

        defined $selfmatch
         or next;

        $is_selfmatch{$selfmatch} = 1
         if $addisolatesasnodes || $addallasnodes;
      }

      my @words = grep { length } split( m#$splitexpr#, $_, -1 );
      ehm( "untreated words on $f:$.: ", join( ', ', @words ) );

      @words = treated(@words)
       or next;

      ehm( "treated words on $f:$.: ", join( ', ', @words ) );

      foreach my $match ( grep { $words2match{$_} } @words )
      {
        #warn "examining $match on $f:$.\n";

        next
         if ( !$includeloops && ( $match eq $selfmatch ) );

        ehm("found at $f:$.: $match");

        $seen{$selfmatch}->{$match} += 1;
      }
    }

    close(IN);
  }

  if ($addallasnodes)
  {
    my @nodes = keys %is_selfmatch;
    return ( \%seen, \@nodes );
  }
  elsif ($addisolatesasnodes)
  {
    my @nonincluders = grep { !defined $seen{$_} } keys %is_selfmatch;
    ehm( sprintf "%d nonincluders among %s",
      scalar(@nonincluders), join( ', ', @_ ) );
    return ( \%seen, \@nonincluders );
  }
  else
  {
    return ( \%seen, [] );
  }
}

sub printmatches
{
  # for each processed target, print their matches;
  # note that the same target may occur in multiple files
  # so we have to merge all results before we can print them

  my %seen;
  my %is_node;

  #ehm("printing matches for", @_);

  foreach my $f (@_)
  {
    ehm("printing matches for $f");
    my ( $seenhere, $nodes ) = matched($f);

    $is_node{$_} = 1 for @$nodes;

    foreach my $target ( keys %$seenhere )
    {
      foreach my $match ( keys %{ $seenhere->{$target} } )
      {
        $seen{$target}->{$match} += $seenhere->{$target}->{$match};
      }
    }
  }

  foreach my $target ( sort keys %seen )
  {
    delete $is_node{$target}
     if !$addallasnodes && $addisolatesasnodes;

    foreach my $source ( sort keys %{ $seen{$target} } )
    {
      print "$target$separator$source\n";
      delete $is_node{$source}
       if !$addallasnodes && $addisolatesasnodes;
    }
  }

  foreach my $node ( sort keys %is_node )
  {
    print "$node\n";
  }
}