#!/usr/bin/env perl # # ln-r - in the given directory/ies, ln identical files to each other # # $Id$ # when are two files ln-able? this script thinks it knows use warnings; use strict; use Getopt::Std; use File::Find; use File::Compare; use Digest::MD5; $0 =~ m#[^/]+$#; my $me = $&; my %opt; getopts( 'hvdns0lae:f:xt:I', \%opt ); my $verbose = $opt{v}; my $debug = $opt{d}; my $dontact = $opt{n}; my $symbolic = $opt{s}; my $xdev = $opt{x}; my $empty2 = $opt{0}; my $sym2 = $opt{l}; my $linked2 = $opt{a}; my $e_rx = $opt{e}; my $f_rx = $opt{f}; my $sep = $opt{t}; my $fn_stdin = $opt{I}; die "error: -t must be followed by a separator\n" if defined($sep) && ( $sep =~ /^-/ || ( $sep =~ /[^.]/ && -e $sep ) ); if ( defined($e_rx) && !eval { $e_rx = qr#$e_rx# } ) { $@ =~ /^[^;]*/; die "error: -e must be followed by a valid regular expression; $&\n"; } if ( defined $f_rx ) { $f_rx = quotemeta($f_rx); $f_rx = qr($f_rx); } HELP_MESSAGE() if $opt{h} || !( @ARGV || $fn_stdin ) || ( defined $opt{x} && !defined $opt{s} ); my $csvw; # the object that will write the CSV file, if -t is specified if ( defined $sep ) { eval { require Text::CSV::Encoded } or die "cannot produce CSV, you may have to install Text::CSV::Encoded from CPAN:\n$@"; $csvw = new Text::CSV::Encoded( { sep_char => $sep, encoding => 'utf8', eol => $/ } ) or die sprintf( "cannot create CSV writer: %s\n", join( ', ', Text::CSV::Encoded->error_diag ) ); } sub HELP_MESSAGE { print STDERR < ); } if ( !@ARGV ) { # nothing to find; # at this point, this can only happen with -I, and it's fine exit(0); } my %fn2devno; # maps a filename to its device number my %fn2inode; # maps a filename to its inode number my %fn2nrbyt; # maps a filename to its size in bytes find( sub { return if !-f; # even if it's a symbolic link, it must point to a file return if !$sym2 && -l; return if -z && !$empty2; # only consider links to nonempty files if -0 my $fp = $File::Find::name; return if defined $e_rx && $fp !~ /$e_rx/; return if defined $f_rx && $fp !~ /$f_rx/; my @stat = stat; $fn2devno{$fp} = $sym2 ? (lstat)[0] : $stat[0]; $fn2inode{$fp} = $stat[1]; $fn2nrbyt{$fp} = $stat[7]; }, @ARGV ); warn scalar( keys %fn2nrbyt ) . " filenames used\n" if $debug; # note that we haven't inspected any file contents up to this point # unless $stat[7] secretly reads the file (do any filesystems do this?) # partition the files by size and, unless -x is specified, by device number; # this should be faster than *sorting* by devno and size, # but I haven't measured it my @fns_by_size = partition( ( $xdev ? sub { $fn2nrbyt{ $_[0] } <=> $fn2nrbyt{ $_[1] } } : sub { $fn2nrbyt{ $_[0] } <=> $fn2nrbyt{ $_[1] } || $fn2devno{ $_[0] } <=> $fn2devno{ $_[1] }; } ), keys %fn2devno ); my ( $nreq, $nrne ) = ( 0, 0 ); foreach my $same_dev_and_size (@fns_by_size) { next if @$same_dev_and_size < 2; # for debugging only if ( $debug && defined( my $fn = $same_dev_and_size->[0] ) ) { if ($xdev) { warn sprintf( "%d files on device %d have %d bytes, e.g. %s\n", scalar(@$same_dev_and_size), $fn2devno{$fn}, $fn2nrbyt{$fn}, $fn ); } else { warn sprintf( "%d files have %d bytes, e.g. %s\n", scalar(@$same_dev_and_size), $fn2nrbyt{$fn}, $fn ); } } # partition further by MD5 checksum my %fn2md5 = map { $_ => fn2md5($_) } @$same_dev_and_size; foreach my $same_md5 ( partition( sub { $fn2md5{ $_[0] } cmp $fn2md5{ $_[1] } }, @$same_dev_and_size ) ) { next if @$same_md5 < 2; # partition further by content equality foreach my $same_content ( partition( ( $debug ? \&counting_compare : \&compare ), @$same_md5 ) ) { next if @$same_content < 2; if ($debug) { warn sprintf( "%d files are compared, e.g. %s\n", scalar(@$same_content), $same_content->[0] ); } link_all(@$same_content); } } } if ($debug) { warn "$nreq file comparisons found equality\n"; warn "$nrne file comparisons found inequality\n"; } exit(0); sub fn2md5 { open( FILE, '<', $_[0] ) or return; binmode(FILE); my $md5 = Digest::MD5->new->addfile(*FILE)->digest; close(FILE); return $md5; } sub counting_compare { my $cmp = compare(@_); if ( $cmp == 0 ) { ++$nreq; } else { ++$nrne; } return $cmp; } sub partition # partitions a list based on a comparison function, # returning a list of arrayrefs with the partitions; # the first argument is the comparison function, # which (for compatibility with sort) should return 0 # iff the arguments belong in the same partition { my $cmp = shift(@_) or die "BUG: partition function incorrectly called\n"; return () if !@_; my @result = ( [ shift(@_) ] ); ITEM: foreach my $item (@_) { foreach my $part (@result) { if ( !&$cmp( $item, $part->[0] ) ) { push( @$part, $item ); # TODO: use linked lists instead for efficiency next ITEM; # last isn't good enough } } push( @result, [$item] ); } @result; } sub next_filename { my ( $prefix, $incrementable ) = $_[0] =~ m#^(.*?)([a-zA-Z]*[0-9]*)$#; ++$incrementable; $prefix . $incrementable; } sub link_all { my $fn = shift(@_); my $sopt = $symbolic ? ' -s' : ''; foreach my $lnable (@_) { # only include files already linked together if -= is specified next if !defined($linked2) && ( $fn2inode{$lnable} eq $fn2inode{$fn} ); if ($verbose) { warn "$me: ln$sopt $fn $lnable\n"; # ln [-s] OLD NEW } if ( defined $sep ) { # just output a table entry $csvw->print( *STDOUT, [ sort ( $fn, $lnable ) ] ); next; } # first mv it to a new, unique name, so we can undo if linking fails my $f2name = next_filename($lnable); while ( -e $f2name ) { $f2name = next_filename($lnable); } if ($dontact) { warn "would mv $lnable $f2name and ln$sopt $fn $lnable\n" if $debug; next; } warn "will mv $lnable $f2name and ln$sopt $fn $lnable\n" if $debug; if ( !rename $lnable, $f2name ) { warn "$me: failed to mv $lnable $f2name\n"; } elsif ( $symbolic ? !symlink( $fn, $lnable ) : !link( $fn, $lnable ) ) { warn "$me: failed to ln$sopt $fn $lnable\n"; # linking failed; undo rename $f2name, $lnable; } elsif ( !unlink $f2name ) { warn "$me: failed to rm $f2name, the mved $fn\n"; } } }