#!/usr/bin/env perl # # xref - list filenames with their occurrences in their contents as words # # $Id$ # framework copied from ./grep-mbox #changes IN PROGRESS, UNTESTED: # -t option becomes -s # new -t option (separator) # default separator becomes , # don't split input on / or \ or the -d argument # we need to be smarter about directory paths! use strict; use warnings; use Getopt::Std; use File::Spec; my %opt; getopts( '01e:bdhilrs:t:v', \%opt ); my $me = do { my @vdf = File::Spec->splitpath($0); pop(@vdf); }; my $splitexpr = '[^\w.-/\\\\]+'; # don't split on directory separators HELP_MESSAGE() if $opt{h}; my $strippaths = $opt{b}; $splitexpr = $opt{d} if defined $opt{d}; study $splitexpr; my $headerexpr = $opt{s}; my $separator = $opt{t} // ','; my $process = join( ';', grep { /./ } ( $opt{i} ? 'y/A-Z/a-z/' : '', $opt{e} || '' ) ); my $includeloops = $opt{l}; my $addisolatesasnodes = $opt{0}; my $addallasnodes = $opt{1}; my $verbose = $opt{v}; die("No regular file given as argument\n") if ( !@ARGV ); # test the Perl fragments foreach my $o ( 'e', 's' ) { defined $opt{$o} or next; local ($_) = ''; eval( $opt{$o} ); $@ and die "incorrect -$o option value: $@"; } # we need to maintain a binary relation; represent it as a hash-valued hash, # i.e. a mapping string -> string -> 1 my @words2match = @ARGV; # if -h was specified, look inside the content for @words2match if ( defined $headerexpr ) { my %w2m; local (@ARGV) = @ARGV; # !! while (<>) { chomp; local ($_) = $_; if ( eval($headerexpr) ) { $w2m{$_} += 1; } } @words2match = sort keys %w2m; } if ($strippaths) { @words2match = map { pathstripped($_) } @words2match; } if ($process) { @words2match = map { processed($_) } @words2match; } my %words2match = map { $_ => 1 } @words2match; #ehm("file name arguments: " , join(', ', @ARGV)); ehm( "words to match: ", join( ', ', sort keys %words2match ) ); ehm( scalar(@ARGV) . ' files to process' ); printmatches(@ARGV); # must be provided all at once for correct -0 handling exit(0); # auxiliary functions: sub ehm { warn join( ' ', @_ ), "\n" if $verbose; } sub HELP_MESSAGE { print STDERR <splitpath( shift(@_) ); pop(@vdf); } sub processed { local ($_) = shift(@_); eval $process; die "$me: invalid -e expression: $@\n" if $@; $_; } sub treated { my @words = @_; if ($strippaths) { @words = map { pathstripped($_) } @words; } if ($process) { @words = map { processed($_) } @words; } @words; } sub matched # returns (\%seen, \@nonincluders), the latter is only filled if $addisolatesasnodes { my %seen; # the returned result; a mapping: target -> source -> number my %is_selfmatch; foreach my $f (@_) { if ( !open( IN, '<', $f ) ) { warn "Cannot read from file $f: #@\n"; next; } my $selfmatch; if ( !defined $headerexpr ) { ($selfmatch) = treated($f); $is_selfmatch{$selfmatch} = 1 if $addisolatesasnodes || $addallasnodes; } while () { chomp; if ( defined($headerexpr) ) { # see if a new 'file' starts on this line and determine its name local ($_) = $_; if ( eval $headerexpr ) { ($selfmatch) = treated($_); ehm("begin at $f:$.: $selfmatch"); } defined $selfmatch or next; $is_selfmatch{$selfmatch} = 1 if $addisolatesasnodes || $addallasnodes; } my @words = grep { length } split( m#$splitexpr#, $_, -1 ); ehm( "untreated words on $f:$.: ", join( ', ', @words ) ); @words = treated(@words) or next; ehm( "treated words on $f:$.: ", join( ', ', @words ) ); foreach my $match ( grep { $words2match{$_} } @words ) { #warn "examining $match on $f:$.\n"; next if ( !$includeloops && ( $match eq $selfmatch ) ); ehm("found at $f:$.: $match"); $seen{$selfmatch}->{$match} += 1; } } close(IN); } if ($addallasnodes) { my @nodes = keys %is_selfmatch; return ( \%seen, \@nodes ); } elsif ($addisolatesasnodes) { my @nonincluders = grep { !defined $seen{$_} } keys %is_selfmatch; ehm( sprintf "%d nonincluders among %s", scalar(@nonincluders), join( ', ', @_ ) ); return ( \%seen, \@nonincluders ); } else { return ( \%seen, [] ); } } sub printmatches { # for each processed target, print their matches; # note that the same target may occur in multiple files # so we have to merge all results before we can print them my %seen; my %is_node; #ehm("printing matches for", @_); foreach my $f (@_) { ehm("printing matches for $f"); my ( $seenhere, $nodes ) = matched($f); $is_node{$_} = 1 for @$nodes; foreach my $target ( keys %$seenhere ) { foreach my $match ( keys %{ $seenhere->{$target} } ) { $seen{$target}->{$match} += $seenhere->{$target}->{$match}; } } } foreach my $target ( sort keys %seen ) { delete $is_node{$target} if !$addallasnodes && $addisolatesasnodes; foreach my $source ( sort keys %{ $seen{$target} } ) { print "$target$separator$source\n"; delete $is_node{$source} if !$addallasnodes && $addisolatesasnodes; } } foreach my $node ( sort keys %is_node ) { print "$node\n"; } }