#!/usr/bin/perl
#
# Find duplicates by Alan Mizrahi
# Send any comments or suggestions to alan at mizrahi dot com dot ve
# http://www.mizrahi.com.ve/projects/
# Version 1.0 2008/07/15
#
# Takes files and/or directories as arguments (directories are scanned
# recursively) and prints one line per group of regular files that have
# identical content: size, base64 MD5 digest, and the file names.
#
# Strategy: files are first grouped by size; only files sharing a size are
# opened.  Each group is then read chunk by chunk in lock step, and a file is
# dropped from the group as soon as the MD5 of the data read so far matches no
# other file in the group, so unique files are rarely read to the end.
#

use strict;
use warnings;
use Digest::MD5 qw(md5 md5_hex md5_base64);
use Fcntl;

my $BUFFSIZE = 32768; # buffer size to read files, this may be tuned

my %fileByName; # all files, name => size
my %fileBySize; # all files, size => [fname1, fname2, ...]
my %fileByHash; # only with duplicates, hash => [fname1, fname2, ...]
my %dirSeen;    # directories already scanned, "dev:inode" => 1

# Run as a program when executed directly; stay quiet when loaded with
# require/do so the subs can be exercised on their own.
findDups(@ARGV) unless caller;

# Entry point: scan the given files/directories and print the report.
#   @args - file and directory names (normally @ARGV)
# Prints the syntax line and returns when called without arguments.
sub findDups {
    my @args = @_;

    if (!@args) {
        print "Syntax: findDups.pl [...]\n";
        return;
    }

    # start from a clean slate so the sub can be called more than once
    %fileByName = ();
    %fileBySize = ();
    %fileByHash = ();
    %dirSeen    = ();

    # treat params as files or dirs
    foreach my $arg (@args) {
        if (-d $arg) {
            addDir($arg);
        } else {
            addFile($arg);
        }
    }

    # get filesize after adding dirs, to avoid more than one stat() per file
    # in case of symlinks, duplicate dirs, etc
    foreach my $fname (keys %fileByName) {
        my $size = -s $fname;
        if (!defined $size) {
            # -s yields undef when the file cannot be examined (e.g. it
            # disappeared after being listed)
            warn "Error getting size of $fname: $!";
            delete $fileByName{$fname};
            next;
        }
        # -s yields '' rather than 0 for an empty file; store a real number
        $fileByName{$fname} = $size || 0;
    }

    # build hash of filesize => [ filename1, filename2, ...]
    foreach my $fname (keys %fileByName) {
        push @{ $fileBySize{ $fileByName{$fname} } }, $fname;
    }

    # sizes are numbers, so sort them numerically
    foreach my $size (sort { $a <=> $b } keys %fileBySize) {
        my @names = @{ $fileBySize{$size} };
        next if @names < 2; # a size owned by a single file cannot have dups
        hashSameSizeFiles($size, @names);
    }

    printReport();
    return;
}

# Compare the content of files that all have the same size and record every
# group of two or more identical files in %fileByHash.
#   $size  - the common file size in bytes
#   @names - names of the files having that size
sub hashSameSizeFiles {
    my ($size, @names) = @_;

    my %checking; # fname => { fh => file handle, md5 => Digest::MD5 object }
    foreach my $fname (@names) {
        if (sysopen my $fh, $fname, O_RDONLY) {
            binmode $fh; # hash raw bytes (matters where text/binary differ)
            $checking{$fname} = { fh => $fh, md5 => Digest::MD5->new };
        } else {
            warn "Error opening $fname: $!";
        }
    }

    my $read = 0; # bytes consumed so far from every file still in %checking
    while ($read < $size && scalar(keys %checking) > 1) {
        # every file must contribute exactly the same number of bytes per
        # round, otherwise the running digests would not be comparable
        my $want = $size - $read;
        $want = $BUFFSIZE if $want > $BUFFSIZE;

        my %byDigest; # digest of the data read so far => [fname, ...]
        foreach my $fname (keys %checking) {
            my $entry = $checking{$fname};
            my $chunk = readChunk($entry->{fh}, $want);

            if (!defined $chunk) {
                warn "Error reading from $fname: $!";
            } elsif (length($chunk) < $want) {
                # the file is shorter than its recorded size
                warn "Unexpected end of file reading $fname";
            } else {
                $entry->{md5}->add($chunk);
                # digest() resets the object, so take it from a clone
                push @{ $byDigest{ $entry->{md5}->clone->digest } }, $fname;
                next;
            }

            close $entry->{fh};
            delete $checking{$fname};
        }
        $read += $want;

        # remove files whose content so far matches no other file
        foreach my $group (values %byDigest) {
            next if @{$group} > 1;
            my $fname = $group->[0];
            close $checking{$fname}{fh};
            delete $checking{$fname};
        }
    }

    # the survivors are duplicates, but there might be more than one group
    # of md5 sums
    my %group; # base64 digest => [fname, ...]
    foreach my $fname (keys %checking) {
        close $checking{$fname}{fh};
        push @{ $group{ $checking{$fname}{md5}->b64digest } }, $fname;
    }
    foreach my $hash (keys %group) {
        # a lone file is not a duplicate (possible when other opens failed)
        next if @{ $group{$hash} } < 2;
        push @{ $fileByHash{$hash} }, @{ $group{$hash} };
    }
    return;
}

# Read up to $want bytes from $fh, retrying after short reads.
# Returns the data read (shorter than $want only at end of file), or undef
# if sysread reported an error (the reason is left in $!).
sub readChunk {
    my ($fh, $want) = @_;

    my $chunk = '';
    while (length($chunk) < $want) {
        my $r = sysread($fh, $chunk, $want - length($chunk), length($chunk));
        return unless defined $r; # read error
        last if $r == 0;          # end of file
    }
    return $chunk;
}

# Print the duplicate groups collected in %fileByHash: a header line followed
# by one "size<TAB>hash<TAB>names" line per group.  Prints nothing when no
# duplicates were found.  Groups are ordered by size, then hash, and the names
# inside a group are sorted, so the output is reproducible.
sub printReport {
    return if !%fileByHash;

    print "Size\tHash\t\t\tFilenames\n";

    # every file of a group has the same size, so the first one will do
    my %sizeOf = map { $_ => $fileByName{ $fileByHash{$_}[0] } } keys %fileByHash;

    foreach my $hash (sort { $sizeOf{$a} <=> $sizeOf{$b} or $a cmp $b } keys %sizeOf) {
        my @names = sort @{ $fileByHash{$hash} };
        print $sizeOf{$hash}, "\t$hash\t", join(' ', @names), "\n";
    }
    return;
}

# add directory's content to scan (recursively)
#   $_[0] - directory name
# A directory that was already scanned (same device and inode) is skipped, so
# symlink loops and the same directory given under two names do not make
# files show up as duplicates of themselves.
sub addDir {
    my $dir = $_[0];

    my ($dev, $ino) = stat $dir;
    if ($ino) { # skip the check where inode numbers are not available
        return if $dirSeen{"$dev:$ino"}++;
    }

    my $dh;
    if (!opendir $dh, $dir) {
        warn "Error opening $dir: $!";
        return;
    }
    # read everything and close before recursing, so only one directory
    # handle is open at a time
    my @dents = readdir $dh;
    closedir $dh;

    foreach my $dent (@dents) {
        next if $dent eq '.' || $dent eq '..';
        my $path = $dir . '/' . $dent;
        if (-d $path) {
            addDir($path);
        } else {
            addFile($path);
        }
    }
    return;
}

# add a file to scan
#   $_[0] - file name; anything that is not a regular file is ignored
sub addFile {
    $fileByName{$_[0]} = 0 if -f $_[0]; # only add regular files
}