#!/usr/bin/env perl
#
# ln-r - in the given directory/ies, ln identical files to each other
#
# $Id$

# when are two files ln-able?  this script thinks it knows

use warnings;
use strict;

use Getopt::Std;
use File::Find;
use File::Compare;
use Digest::MD5;

$0 =~ m#[^/]+$#;
my $me = $&;

my %opt;
getopts( 'hvdns0lae:f:xt:I', \%opt );

my $verbose  = $opt{v};
my $debug    = $opt{d};
my $dontact  = $opt{n};
my $symbolic = $opt{s};
my $xdev     = $opt{x};
my $empty2   = $opt{0};
my $sym2     = $opt{l};
my $linked2  = $opt{a};
my $e_rx     = $opt{e};
my $f_rx     = $opt{f};
my $sep      = $opt{t};
my $fn_stdin = $opt{I};

die "error: -t must be followed by a separator\n"
 if defined($sep) && ( $sep =~ /^-/ || ( $sep =~ /[^.]/ && -e $sep ) );

if ( defined($e_rx) && !eval { $e_rx = qr#$e_rx# } )
{
  $@ =~ /^[^;]*/;
  die "error: -e must be followed by a valid regular expression; $&\n";
}

if ( defined $f_rx )
{
  $f_rx = quotemeta($f_rx);
  $f_rx = qr($f_rx);
}

HELP_MESSAGE()
 if $opt{h}
 || !( @ARGV || $fn_stdin )
 || ( defined $opt{x} && !defined $opt{s} );

my $csvw;    # the object that will write the CSV file, if -t is specified

if ( defined $sep )
{
  eval { require Text::CSV::Encoded }
   or die
"cannot produce CSV, you may have to install Text::CSV::Encoded from CPAN:\n$@";
  $csvw =
   new Text::CSV::Encoded( { sep_char => $sep, encoding => 'utf8', eol => $/ } )
   or die sprintf( "cannot create CSV writer: %s\n",
    join( ', ', Text::CSV::Encoded->error_diag ) );
}

sub HELP_MESSAGE
{
  print STDERR <<ZZ;

Usage:

  $me [-v|-d] [-n] [-s|-t sep] [-x] [-a] [-0] [-l] [-e rx] [-f str] [-I] file-or-directory [file-or-directory [...]]

to compare all files given or recursively in the given directory trees,
and hard link any equal files among them together as far as possible.

Options:

  -s  create symlinks instead of hard links
  -x  also consider cross-device links (only valid with -s)
  -0  include (links to) empty files
      (they are disregarded otherwise, empty files are often locks etc.)
  -v  print a link command for all links
  -d  print other progress messages
  -n  don't actually do any linking
  -l  include symbolic links
      (affecting the links themselves, not the files they point to)
  -a  include all equal pairs, i.e. also those already linked together
      (e.g. to change between hard and symbolic links)

  -e rx
     include only filenames matching the regular expression rx

  -f str
     include only filenames containing the string str

  -I
     read (additional) directory and/or file names from standard input
     (one per line)

  -t sep
      don't actually do any linking;
      instead, produce the list of equal files on output
      in two-column CSV format, using sep as the separator character
      with the columns of each row sorted lexicographically
      (the rows are not printed in sorted order)
ZZ

  exit(0);
}

if ($fn_stdin)
{
  push( @ARGV, map { chomp; $_ } <> );
}

if ( !@ARGV )
{
  # nothing to find;
  # at this point, this can only happen with -I, and it's fine
  exit(0);
}

my %fn2devno;    # maps a filename to its device number
my %fn2inode;    # maps a filename to its inode number
my %fn2nrbyt;    # maps a filename to its size in bytes

find(
  sub {
    return
     if !-f;     # even if it's a symbolic link, it must point to a file

    return
     if !$sym2 && -l;

    return
     if -z && !$empty2;    # only consider links to nonempty files if -0

    my $fp = $File::Find::name;

    return
     if defined $e_rx && $fp !~ /$e_rx/;

    return
     if defined $f_rx && $fp !~ /$f_rx/;

    my @stat = stat;

    $fn2devno{$fp} = $sym2 ? (lstat)[0] : $stat[0];
    $fn2inode{$fp} = $stat[1];
    $fn2nrbyt{$fp} = $stat[7];
  },
  @ARGV
);

warn scalar( keys %fn2nrbyt ) . " filenames used\n" if $debug;

# note that we haven't inspected any file contents up to this point
# unless $stat[7] secretly reads the file (do any filesystems do this?)

# partition the files by size and, unless -x is specified, by device number;
# this should be faster than *sorting* by devno and size,
# but I haven't measured it
my @fns_by_size = partition(
  (
    $xdev
    ? sub { $fn2nrbyt{ $_[0] } <=> $fn2nrbyt{ $_[1] } }
    : sub {
      $fn2nrbyt{ $_[0] } <=> $fn2nrbyt{ $_[1] }
       || $fn2devno{ $_[0] } <=> $fn2devno{ $_[1] };
    }
  ),
  keys %fn2devno
);

my ( $nreq, $nrne ) = ( 0, 0 );

foreach my $same_dev_and_size (@fns_by_size)
{
  next
   if @$same_dev_and_size < 2;

  # for debugging only
  if ( $debug && defined( my $fn = $same_dev_and_size->[0] ) )
  {
    if ($xdev)
    {
      warn sprintf(
        "%d files on device %d have %d bytes, e.g. %s\n",
        scalar(@$same_dev_and_size),
        $fn2devno{$fn}, $fn2nrbyt{$fn}, $fn
      );
    }
    else
    {
      warn sprintf(
        "%d files have %d bytes, e.g. %s\n",
        scalar(@$same_dev_and_size),
        $fn2nrbyt{$fn}, $fn
      );
    }
  }

  # partition further by MD5 checksum
  my %fn2md5 = map { $_ => fn2md5($_) } @$same_dev_and_size;

  foreach my $same_md5 (
    partition(
      sub { $fn2md5{ $_[0] } cmp $fn2md5{ $_[1] } },
      @$same_dev_and_size
    )
   )
  {
    next
     if @$same_md5 < 2;

    # partition further by content equality
    foreach my $same_content (
      partition( ( $debug ? \&counting_compare : \&compare ), @$same_md5 ) )
    {
      next
       if @$same_content < 2;

      if ($debug)
      {
        warn sprintf( "%d files are compared, e.g. %s\n",
          scalar(@$same_content), $same_content->[0] );
      }

      link_all(@$same_content);
    }
  }
}

if ($debug)
{
  warn "$nreq file comparisons found equality\n";
  warn "$nrne file comparisons found inequality\n";
}

exit(0);

sub fn2md5
{
  open( FILE, '<', $_[0] )
   or return;
  binmode(FILE);
  my $md5 = Digest::MD5->new->addfile(*FILE)->digest;
  close(FILE);
  return $md5;
}

sub counting_compare
{
  my $cmp = compare(@_);
  if   ( $cmp == 0 ) { ++$nreq; }
  else               { ++$nrne; }
  return $cmp;
}

sub partition

 # partitions a list based on a comparison function,
 # returning a list of arrayrefs with the partitions;
 # the first argument is the comparison function,
 # which (for compatibility with sort) should return 0
 # iff the arguments belong in the same partition
{
  my $cmp = shift(@_)
   or die "BUG: partition function incorrectly called\n";

  return ()
   if !@_;

  my @result = ( [ shift(@_) ] );

 ITEM: foreach my $item (@_)
  {
    foreach my $part (@result)
    {
      if ( !&$cmp( $item, $part->[0] ) )
      {
        push( @$part, $item );   # TODO: use linked lists instead for efficiency
        next ITEM;               # last isn't good enough
      }
    }
    push( @result, [$item] );
  }

  @result;
}

sub next_filename
{
  my ( $prefix, $incrementable ) = $_[0] =~ m#^(.*?)([a-zA-Z]*[0-9]*)$#;
  ++$incrementable;
  $prefix . $incrementable;
}

sub link_all
{
  my $fn = shift(@_);

  my $sopt = $symbolic ? ' -s' : '';

  foreach my $lnable (@_)
  {
    # only include files already linked together if -= is specified
    next
     if !defined($linked2) && ( $fn2inode{$lnable} eq $fn2inode{$fn} );

    if ($verbose)
    {
      warn "$me: ln$sopt $fn $lnable\n";    # ln [-s] OLD NEW
    }

    if ( defined $sep )
    {
      # just output a table entry
      $csvw->print( *STDOUT, [ sort ( $fn, $lnable ) ] );
      next;
    }

    # first mv it to a new, unique name, so we can undo if linking fails
    my $f2name = next_filename($lnable);

    while ( -e $f2name ) { $f2name = next_filename($lnable); }

    if ($dontact)
    {
      warn "would mv $lnable $f2name and ln$sopt $fn $lnable\n" if $debug;
      next;
    }

    warn "will mv $lnable $f2name and ln$sopt $fn $lnable\n" if $debug;

    if ( !rename $lnable, $f2name )
    {
      warn "$me: failed to mv $lnable $f2name\n";
    }
    elsif ( $symbolic ? !symlink( $fn, $lnable ) : !link( $fn, $lnable ) )
    {
      warn "$me: failed to ln$sopt $fn $lnable\n";

      # linking failed; undo
      rename $f2name, $lnable;
    }
    elsif ( !unlink $f2name )
    {
      warn "$me: failed to rm $f2name, the mved $fn\n";
    }
  }
}