A sophisticated duplicate file finder that recursively searches a directory, groups files of identical size, and then performs a binary comparison within each group to find exact duplicates. It generates a batch file (killdupe.cmd by default) containing a deletion command for every duplicate, keeping the first occurrence in each group (the shallowest path; ties are broken by string order). It uses an efficient two-pass algorithm: first by file size, then by binary content comparison. Empty files are skipped.
perl killdupe.pl <directory> [output_file]
Scans the directory recursively, finds duplicates by size and then by binary comparison, and writes the deletion commands to killdupe.cmd (or to the specified output file).
#!/usr/bin/perl
#
# ft($root) - recursively collect the paths of all plain files below $root.
#
# Prints a progress line for every directory visited. Each returned path is
# the parent directory and the entry name joined with a backslash
# (Windows-style separator).
#
# Returns a flat list of file paths. A directory that cannot be opened is
# reported on STDERR and contributes no files.
sub ft {
    my ($root) = @_;
    my @files;
    print "exploring $root\n";

    # Lexical handle: each recursion level owns its own directory handle.
    my $dh;
    unless (opendir $dh, $root) {
        warn "cannot open directory '$root': $!\n";
        return @files;
    }
    my @entries = readdir $dh;
    closedir $dh;

    for my $entry (@entries) {
        next if $entry eq '.' or $entry eq '..';
        my $path = $root . "\\" . $entry;
        if (-f $path) {
            push @files, $path;
        }
        elsif (-d $path) {
            push @files, ft($path);
        }
    }
    return @files;
}
# ss(@files) - group files that share the same (non-zero) size.
#
# Returns a hash mapping a size in bytes to a '|'-joined string of every path
# that has that size, in the order the paths were passed in. Only sizes shared
# by at least two files appear in the result. Files that cannot be stat'ed
# and empty files are skipped. Returns an empty list when called without
# arguments.
sub ss {
    my (@files) = @_;
    return if !@files;

    my (%first_of_size, %samesizes);
    for my $file (@files) {
        my $size = (stat $file)[7];
        next if !$size;    # stat failed (undef) or the file is empty (0)

        # Test key existence, not truthiness of the stored path, so a file
        # literally named "0" is still remembered as the first of its size.
        if (!exists $first_of_size{$size}) {
            $first_of_size{$size} = $file;
            next;
        }
        $samesizes{$size} =
            exists $samesizes{$size}
            ? $samesizes{$size} . '|' . $file
            : $first_of_size{$size} . '|' . $file;
    }
    return %samesizes;
}
# fc($first, $second) - byte-for-byte comparison of two files.
#
# Returns 1 when both files can be opened and have identical contents,
# 0 otherwise (including when either file cannot be opened or read, or when
# one file is a prefix of the other).
sub fc {
    my ($first, $second) = @_;

    # Three-argument open: the file name is never interpreted as a mode.
    # Lexical handles close automatically on every return path.
    open(my $fh_first,  '<', $first)  or return 0;
    open(my $fh_second, '<', $second) or return 0;
    binmode $fh_first;
    binmode $fh_second;

    my ($chunk_first, $chunk_second);
    while (1) {
        my $got_first  = read($fh_first,  $chunk_first,  65536);
        my $got_second = read($fh_second, $chunk_second, 65536);

        # A read error must never be reported as "identical": the result is
        # used to decide which files get deleted.
        return 0 unless defined $got_first and defined $got_second;

        # Different chunk lengths mean one file ended before the other.
        return 0 unless $got_first == $got_second;

        # Both files reached EOF together without any difference.
        return 1 if $got_first == 0;

        return 0 unless $chunk_first eq $chunk_second;
    }
}
# ns - sort comparator (operates on the package variables $a and $b).
#
# Orders paths by their number of backslash-separated components, fewest
# first; paths with the same number of components are ordered by string
# comparison. Returns -1, 0 or +1; 0 is returned only for equal strings.
sub ns {
    # split in scalar context yields the number of fields.
    my $depth_a = split(/\\/, $a);
    my $depth_b = split(/\\/, $b);
    return $depth_a <=> $depth_b || $a cmp $b;
}
# sf(@files) - partition a list of files into groups of identical content.
#
# The files are sorted with ns(); the first file of the remaining list is
# compared (via fc()) with every other remaining file, its duplicates are
# recorded and removed from the list, and the process repeats on the files
# that did not match.
#
# Returns a hash mapping each "original" (the first file of a group in ns()
# order) to a '|'-joined string of its duplicates. Files without duplicates
# do not appear. Returns an empty list for fewer than two files.
sub sf {
    my @pending = sort ns @_;
    my %samefiles;

    # Iterative instead of recursive: matched files are dropped from the
    # work list rather than blanked out, so no comparison is ever attempted
    # against a placeholder entry.
    while (@pending > 1) {
        my $first = shift @pending;
        my (@dupes, @unmatched);
        for my $candidate (@pending) {
            if (fc($first, $candidate)) {
                push @dupes, $candidate;
            }
            else {
                push @unmatched, $candidate;
            }
        }
        $samefiles{$first} = join('|', @dupes) if @dupes;
        @pending = @unmatched;
    }
    return %samefiles;
}
# Main program: scan the directory given as the first argument, find groups
# of identical files and write a batch file that deletes every duplicate
# while keeping the first file of each group.
#
# Arguments: <directory> [out]  (out defaults to 'killdupe.cmd')
my $dir = $ARGV[0];
die "usage: killdupe.pl <directory> [out]\n"
    unless defined $dir and length $dir;
die "'$dir' is not dir!" unless (-d $dir);

print "searching files...\n";
my @files      = ft($dir);
my $file_count = @files;
print "found $file_count files.\n";
print "----------------------------------------------\n";

print "searching same size files...\n";
my %samesizes   = ss(@files);
my $group_count = keys %samesizes;
print "found $group_count same size files.\n";
print "----------------------------------------------\n";

print "examining these files...\n";
my $out = (defined $ARGV[1] and length $ARGV[1]) ? $ARGV[1] : 'killdupe.cmd';
open(my $out_fh, '>', $out) or die "cannot write '$out': $!\n";

# Sorted iteration keeps the generated batch file reproducible between runs.
for my $size (sort { $a <=> $b } keys %samesizes) {
    my @samesize  = split(/\|/, $samesizes{$size});
    my %samefiles = sf(@samesize);
    for my $original (sort keys %samefiles) {
        my @dupes      = split(/\|/, $samefiles{$original});
        my $dupe_count = @dupes;
        print "'$original' has $dupe_count dupes:\n";

        # Inside a batch file '%' starts a variable expansion; double it so
        # file names containing '%' reach the command unchanged.
        (my $original_cmd = $original) =~ s/%/%%/g;
        print {$out_fh} "rem \"$original_cmd\"\n";
        for my $dupe (@dupes) {
            print "'$dupe'\n";
            (my $dupe_cmd = $dupe) =~ s/%/%%/g;
            print {$out_fh} "del \"$dupe_cmd\"\n";
        }
        print "----------------------------------------------\n";
    }
}

# Buffered write errors only surface at close time.
close($out_fh) or die "cannot close '$out': $!\n";
print "done.\n";