#!/usr/bin/env perl
#
# diffcluster - compare files and output file names on same line iff equal
#
# $Id$

# Note: it's both faster and more exact to just open the files
# and read them until a difference is found.
# But we may run out of file descriptors that way.
# The solution to that is divide and conquer (compare first half,
# then second half, then merge results).
#
# What this script does instead: files are first grouped by size, and
# same-sized files are then grouped by the MD5 digest of their contents.
# Every resulting group is printed on one line, names joined by the
# separator (-t, default: a single space).

use warnings;
use strict;

use Getopt::Std;
use Digest::MD5 qw(md5);

my %opt;
getopts( 'hvt:', \%opt );
HELP_MESSAGE() if $opt{'h'};

my $sep = $opt{'t'} // ' ';

# Print a usage summary on STDERR.
# NOTE(review): the original usage text and whatever followed it were lost
# from the source under review; the wording below and the exit status are
# reconstructed from the option string 'hvt:' -- confirm against the
# original before merging.
sub HELP_MESSAGE {
    print STDERR <<'EOF';
usage: diffcluster [-hv] [-t separator] file ...
    -h  show this help
    -v  be verbose
    -t  separator printed between equal files (default: a space)
EOF
    exit 1;
}

# Emit a progress message.
# NOTE(review): the original definition of ehm() was lost; it is presumed
# to print its arguments on STDERR only when -v was given -- confirm.
sub ehm {
    my (@message) = @_;
    print STDERR @message, "\n" if $opt{'v'};
    return;
}

# Return the binary MD5 digest of the contents of $path, or undef when the
# file cannot be opened or read.  The file is streamed through the digest
# object, so it is never held in memory as a whole.
sub md5_of_file {
    my ($path) = @_;

    open( my $fh, '<', $path ) or return;
    binmode($fh);

    # addfile() croaks on a read error; treat that like a failed open
    my $digest = eval { Digest::MD5->new->addfile($fh)->digest };
    close($fh);

    return $digest;
}

# NOTE(review): the construction of %argnr, %size2files and @sizes was lost
# from the source under review.  The reconstruction below is inferred from
# how the surviving code uses them: %argnr orders names within an output
# line, %size2files maps a size to the list of files having it, and size -1
# marks a file whose size could not be determined -- confirm.
my %argnr;             # file name -> position on the command line
my %size2files;        # size      -> [ file names ]
my %size2sum2files;    # size      -> { digest -> [ file names ] }

my $nr = 0;
foreach my $f (@ARGV) {
    $argnr{$f} = $nr++;
    my $size = ( stat $f )[7] // -1;
    push( @{ $size2files{$size} }, $f );
}

my @sizes = sort { $a <=> $b } keys %size2files;

foreach my $size (@sizes) {
    my @files = @{ $size2files{$size} };

    # Hashing is pointless for a file of unknown size, and for a file that
    # is the only one of its size; those get a dummy (empty) sum.
    my $need_hash = $size != -1 && @files > 1;

    foreach my $f (@files) {
        my $sum = '';
        if ($need_hash) {
            my $digest = md5_of_file($f);
            if ( defined $digest ) {
                $sum = $digest;
                ehm("taken MD5 hash of '$f'");
            }
            else {
                # keep the dummy sum, as the original did
                warn "cannot take MD5 hash of '$f'\n";
            }
        }
        push( @{ $size2sum2files{$size}->{$sum} }, $f );
    }
}

# One output line per (size, digest) group; names appear in command-line
# order within a line.
foreach my $size (@sizes) {
    foreach my $sum ( sort { $a cmp $b } keys %{ $size2sum2files{$size} } ) {
        print join( $sep,
            sort { $argnr{$a} <=> $argnr{$b} }
              @{ $size2sum2files{$size}->{$sum} } ),
          "\n";
    }
}