#!/usr/bin/perl
#
# textdist - report the text distance between input lines
#
# $Id$

use warnings;
use strict;

use Getopt::Std;
use Text::CSV;
#use Text::LevenshteinXS qw(distance);  # conditional

my %opt;
getopts( 'h?f:m:pPq:s:t:', \%opt );

HELP_MESSAGE() if grep { /^[h?]/ } keys %opt;

my $sep     = $opt{t} // ',';
my $quote   = $opt{q};
my $maxdist = $opt{m};
my $maxfrac = $opt{f};
my $subst   = $opt{s};

$opt{p} && $opt{P} and die "use -p, -P, or neither, but not both\n";

my $csvw = new Text::CSV(
  { sep_char => $sep, eol => $/, quote_char => ( $opt{'q'} // '' ) } )
 or die "the CSV parser doesn't work (not installed?)\n";

my $distance = ( $opt{p} || $opt{P} )
 ? do
{
  my $diff_with_min = $opt{p};
  # use a distance based on the longest common prefix
  # see http://stackoverflow.com/questions/9114402/regexp-finding-longest-common-prefix-of-two-strings
  sub {
    my ( $minl, $maxl ) = sort { $a <=> $b } map { length } @_;
    my $xorred = $_[0] ^ $_[1];
    $xorred =~ /^\0*/;
    my $d = ( $diff_with_min ? $minl : $maxl ) - $+[0];
    #warn "lengths are $minl and $maxl, distance is $d\n";
    $d;
   }
 }
 : do
{
  # use the Levenshtein distance
  require Text::LevenshteinXS;
  \&Text::LevenshteinXS::distance
};

sub HELP_MESSAGE
{
  print STDERR <<'ZZ';

Usage: $0 [-p|-P] [-t sep] [-q quote] [-m max] [-f frac] [file(s)]

to list the string distances between all pairs of input lines,
in CSV format (second line, first line, distance).
Unless -p or -P is specified, the Levenshtein distance is used.

Options:

  -p  The distance is the difference between the length of the
      shortest line and the length of their common prefix.
  -P  The distance is the difference between the length of the
      longest line and the length of their common prefix.
  -t  The column separator to use; comma by default.
  -q  The field quote character to use; none by default.
  -m  The maximal distance to print.
  -f  The maximal distance to print as a fraction of the length
      of the shortest of the two inputs.

  -s expr
      A substitution operation to perform on the strings
      prior to comparing.  (The original strings are used in the output.)
      For instance: s#^.*/## to remove paths for filenames.
ZZ
  exit
}

sub substituted
{
  local $_ = $_[0];
  eval($subst);
  $_;
}

my @uniqlines;
my %line2nr;
my %line2subst;

while (<>)
{
  chomp;

  if ( !defined $line2nr{$_} )
  {

    if ( defined $subst )
    {
      $line2subst{$_} = substituted($_);
    }

    foreach my $l (@uniqlines)
    {
      my @comparands =
       defined($subst) ? ( $line2subst{$l}, $line2subst{$_} ) : ( $l, $_ );

      if ( defined $maxdist || defined $maxfrac )
      {
        # optimize: avoid comparison
        my @l = sort { $a <=> $b } map { length } @comparands;
        my $mindist = $l[1] - $l[0];

        if ( !$opt{p} )
        {
          next if defined $maxdist && $mindist > $maxdist;
          next if defined $maxfrac && $mindist > $maxfrac * $l[0];
        }
        my $d = &$distance(@comparands);
        next if defined $maxdist && $d > $maxdist;
        next if defined $maxfrac && $d > $maxfrac * $l[0];
        $csvw->print( *STDOUT, [ $_, $l, $d ] );
      }
      else
      {
        my $d = &$distance(@comparands);
        $csvw->print( *STDOUT, [ $_, $l, $d ] );
      }
    }
    push( @uniqlines, $_ );
    $line2nr{$_} = scalar(@uniqlines);
  }
}