#!/usr/bin/perl # # textdist - report the text distance between input lines # # $Id$ use warnings; use strict; use Getopt::Std; use Text::CSV; #use Text::LevenshteinXS qw(distance); # conditional my %opt; getopts( 'h?f:m:pPq:s:t:', \%opt ); HELP_MESSAGE() if grep { /^[h?]/ } keys %opt; my $sep = $opt{t} // ','; my $quote = $opt{q}; my $maxdist = $opt{m}; my $maxfrac = $opt{f}; my $subst = $opt{s}; $opt{p} && $opt{P} and die "use -p, -P, or neither, but not both\n"; my $csvw = new Text::CSV( { sep_char => $sep, eol => $/, quote_char => ( $opt{'q'} // '' ) } ) or die "the CSV parser doesn't work (not installed?)\n"; my $distance = ( $opt{p} || $opt{P} ) ? do { my $diff_with_min = $opt{p}; # use a distance based on the longest common prefix # see http://stackoverflow.com/questions/9114402/regexp-finding-longest-common-prefix-of-two-strings sub { my ( $minl, $maxl ) = sort { $a <=> $b } map { length } @_; my $xorred = $_[0] ^ $_[1]; $xorred =~ /^\0*/; my $d = ( $diff_with_min ? $minl : $maxl ) - $+[0]; #warn "lengths are $minl and $maxl, distance is $d\n"; $d; } } : do { # use the Levenshtein distance require Text::LevenshteinXS; \&Text::LevenshteinXS::distance }; sub HELP_MESSAGE { print STDERR <<'ZZ'; Usage: $0 [-p|-P] [-t sep] [-q quote] [-m max] [-f frac] [file(s)] to list the string distances between all pairs of input lines, in CSV format (second line, first line, distance). Unless -p or -P is specified, the Levenshtein distance is used. Options: -p The distance is the difference between the length of the shortest line and the length of their common prefix. -P The distance is the difference between the length of the longest line and the length of their common prefix. -t The column separator to use; comma by default. -q The field quote character to use; none by default. -m The maximal distance to print. -f The maximal distance to print as a fraction of the length of the shortest of the two inputs. -s expr A substitution operation to perform on the strings prior to comparing. (The original strings are used in the output.) For instance: s#^.*/## to remove paths for filenames. ZZ exit } sub substituted { local $_ = $_[0]; eval($subst); $_; } my @uniqlines; my %line2nr; my %line2subst; while (<>) { chomp; if ( !defined $line2nr{$_} ) { if ( defined $subst ) { $line2subst{$_} = substituted($_); } foreach my $l (@uniqlines) { my @comparands = defined($subst) ? ( $line2subst{$l}, $line2subst{$_} ) : ( $l, $_ ); if ( defined $maxdist || defined $maxfrac ) { # optimize: avoid comparison my @l = sort { $a <=> $b } map { length } @comparands; my $mindist = $l[1] - $l[0]; if ( !$opt{p} ) { next if defined $maxdist && $mindist > $maxdist; next if defined $maxfrac && $mindist > $maxfrac * $l[0]; } my $d = &$distance(@comparands); next if defined $maxdist && $d > $maxdist; next if defined $maxfrac && $d > $maxfrac * $l[0]; $csvw->print( *STDOUT, [ $_, $l, $d ] ); } else { my $d = &$distance(@comparands); $csvw->print( *STDOUT, [ $_, $l, $d ] ); } } push( @uniqlines, $_ ); $line2nr{$_} = scalar(@uniqlines); } }