#!/usr/bin/perl
#############################################################################
#                                                                           #
# Copyright (C) 1996 Michael A. Gumienny                                    #
#                                                                           #
# This program is free software; you can redistribute it and/or modify it   #
# under the terms of the GNU General Public License as published by the     #
# Free Software Foundation; either version 2 of the License, or (at your    #
# option) any later version.                                                #
#                                                                           #
# This program is distributed in the hope that it will be useful, but       #
# WITHOUT ANY WARRANTY; without even the implied warranty of                #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General #
# Public License for more details.                                          #
#                                                                           #
# You should have received a copy of the GNU General Public License along   #
# with this program; if not, write to:                                      #
#                                                                           #
#      Free Software Foundation, Inc.                                       #
#      59 Temple Place - Suite 330                                          #
#      Boston, MA 02111-1307, USA.                                          #
#                                                                           #
# Or you can find the full GNU GPL online at: http://www.gnu.org            #
#                                                                           #
# Please send your comments, updates, improvements, wishes and bug reports  #
# to:                                                                       #
#                                                                           #
#      Michael A. Gumienny           gumienny@hotmail.com                   #
#                                                                           #
#############################################################################

#############################################################################
#                                                                           #
# File:  finddups                                                           #
#                                                                           #
# $Id: finddups,v 1.2 1999/09/19 12:52:05 root Exp root $                   #
#                                                                           #
# Usage: finddups [-rdih?] [base directory]                                 #
#                                                                           #
# Description:                                                              #
#        Used to find possible duplicate files within the provided search   #
#        tree starting from the base directory, regardless of renames or    #
#        locations.                                                         #
#                                                                           #
#        The original version included a crude CRC32 checksum routine that  #
#        works but is very slow in performance. I left it in only for fun   #
#        and for DOS users that may not have any UNIX ports of sum, cksum,  #
#        or the latest theoretically unrepeatable MD5 hash function. Guess  #
#        a script like this will tell, now won't it!                        #
#                                                                           #
#        Please use the "diff" command to verify files are absolutely       #
#        identical before any removal operations are taken.                 #
#                                                                           #
# Options:                                                                  #
#        -r    Recursively check all sub-directories from the base dir that #
#              you passed on the command line.                              #
#        -i    Use the internal CRC32 function. Only use this if nothing    #
#              better has been installed on your system. I would suggest    #
#              using the MD5 hash function. This is set below by the        #
#              variable $CRCFunc and will be used if no '-i' flag is passed.#
#        -d    Debug mode of operation, prints files CRC/hash to stdout.    #
#                                                                           #
# WARNING! Always verify any file that is reported as a duplicate with      #
#      another tool such as 'diff' before you delete it. MD5 is reported to #
#      be infinitely impossible to repeat but so was.... You get the idea!  #
#                                                                           #
# Author: Michael A. Gumienny                                               #
#                                                                           #
# Written: 1995, Re-written 1999                                            #
#                                                                           #
#############################################################################


#############################################################################
#                                                                           #
#                 User modifiable variable definitions:                     #
#                                                                           #
#   NOTE:   NOTE:  NOTE:   NOTE:   NOTE:   NOTE:   NOTE:   NOTE:   NOTE:    #
#                                                                           #
# You will more than likely use one of the following native functions of    #
# the UNIX or ported Windows functions now. These and similar CRC/hash      #
# functions typically return a line similar to the following:               #
#                                                                           #
#    (file checksum/hash value) (file size) (file name)                     #
#                                                                           #
# This script has been hard coded to ignore anything other than the first   #
# field of returned data which is the actual CRC/hash value that we need.   #
# If you need to modify the script to accept a different function's return  #
# value, you can do so in the getdir routine (see the NOTE comment there).  #
#############################################################################
#
# Uncomment the function you wish to use.
# $CRCFunc is the external command that getdir runs once per file whenever
# the '-i' (internal CRC32) flag was not given. Only the first
# whitespace-separated field of the command's first output line is kept as
# the file's checksum/hash value.
#$CRCFunc = "/usr/bin/sum";
#$CRCFunc = "/usr/bin/cksum";
$CRCFunc = "/usr/bin/md5sum";



#############################################################################
#                                                                           #
#                Non User modifiable variable definitions:                  #
#                                                                           #
#############################################################################
# These won't be used unless you choose to use the internal CRC-32 function.
# Several polynomial variants seem to exist, but this is the most common one
# that I found in use...
# GeneratePolyTable XORs this value into the accumulator while building the
# 256-entry lookup table; the table generator shifts right, so the constant
# is given in the matching bit order.
$Polynomial =0xedb88320;      # [PkZip, Autodin II, Ethernet, FDDI]



#############################################################################
# &Help;                                                                    #
# This routine explains brief usage syntax to STDOUT. The program is then   #
#  terminated.                                                              #
#############################################################################
#
sub Help
  {
  # Print a short usage summary to STDOUT and terminate the program.
  # Takes no arguments and never returns to the caller.
  #
  # The synopsis lists every flag the argument parser recognises, and the
  # directory is shown as optional because the script falls back to the
  # current directory when only flags are supplied.
  # Plain print is used: nothing here is a printf format.
  print "Usage:\tfinddups [-rdih?] [directory]\n",
        "\tUsed to find duplicate files irregardless of filename or location.\n\n",
        "\tOptions:\n",
        "\t-r\tRecursively check all sub-directories also.\n",
        "\t-i\tUse internal CRC32 function only if nothing better installed.\n",
        "\t-d\tdebug mode, print file CRC/hash values to stdout.\n",
        "\t-h, -?\tShow this help text.\n";
  exit;
  }


#############################################################################
# getdir (root, recursiveFlag);                                             #
# Routine to gather filenames within a directory, with an optional recursive#
# flag.                                                                     #
#############################################################################
#
sub getdir
 {
 # getdir($rootdir, $r)
 #
 # Walk the directory $rootdir and, for every entry that is not a directory,
 # append "<checksum>\t<path>" to the global @Array.
 #
 #   $rootdir  directory to scan
 #   $r        true to descend into sub-directories (symlinked directories
 #             are never followed)
 #
 # Globals read:    $internal (use &GenCRC instead of the external command),
 #                  $CRCFunc  (external checksum command),
 #                  $debug    (echo each file and checksum to STDOUT)
 # Globals written: @Array
 #
 # Dies if $rootdir cannot be opened; prints a message and exits if the
 # external checksum command cannot be started.
 my ($rootdir, $r) = @_;
 my ($dh, $fh, $filename, $filecrc);

 opendir($dh, $rootdir) || die "Cannot open directory $rootdir: $!\n";

 # Read the whole listing up front and release the handle immediately, so
 # recursion never holds more than one directory handle open at a time.
 my @entries = sort readdir($dh);
 closedir($dh);

 foreach my $entry (@entries)
        {
        next if ($entry =~ /^\.\.?$/);
        $filename = "$rootdir/$entry";

        # root directory gets a double slash prepended so we clean it up.
        $filename =~ s/\/\//\//;

        if (!-d $filename)
                {
                undef($filecrc);
                if ($internal)
                        {
                        # User blindly trusts the internal CRC32 function...
                        # WARNING! Different files can and will report
                        # identical CRC32 checksums; a stronger hash such as
                        # MD5 is the better choice. You have been warned!
                        $filecrc = &GenCRC($filename);
                        }
                else
                        {
                        # User wants to use the system's CRC/hash command.
                        # The list form of the pipe open hands the file name
                        # to the command as a single argument without going
                        # through a shell, so quotes, spaces and other
                        # metacharacters in the name are harmless. $CRCFunc
                        # is split on whitespace so that a command with
                        # options (e.g. "md5sum -b") keeps working.
                        my @cmd = (split(' ', $CRCFunc), $filename);
                        if ( !open($fh, '-|', @cmd) )
                                {
                                print "Unable to read $CRCFunc or $filename for CRC generation.\n";
                                exit;
                                }

                        # Only the first line of output is of interest.
                        $filecrc = <$fh>;
                        close($fh);
                        }

                # NOTE: This next line is where you may have to do some
                # changes for future compatibility, system differences, etc.
                # Most sum/cksum and now md5sum functions return the crc/hash
                # value as the first field of their output. If the crc/hash
                # function that you choose to use is different, then
                # change the following line to suit your needs accordingly...
                #  Some example methods might be
                #     ($filecrc, $junk) = split(" ", $filecrc);
                #     ($junk, $filecrc, $junk) = split(" ", $filecrc);
                # (split on " " also discards the trailing newline.)
                ($filecrc) = split(" ", $filecrc) if defined($filecrc);

                # END OF POSSIBLE MODIFICATIONS SECTION

                if (!defined($filecrc) || $filecrc eq "")
                        {
                        # Without a checksum the file cannot be compared.
                        # Recording it with an empty key would make every
                        # such file look like a duplicate of the others.
                        warn "No checksum obtained for $filename, skipping.\n";
                        }
                else
                        {
                        push(@Array, join("\t", $filecrc, $filename));
                        if ($debug) { print "File: $filename $filecrc\n"; }
                        }
                }
        elsif ($r && !-l $filename)
                {
                # A real (non-symlink) directory and recursion was requested.
                &getdir($filename, 1);
                }
        }
 }


#############################################################################
# This routine generates the CRC32 polynomial table when called.            #
#############################################################################
#
sub GeneratePolyTable
  {
  # Fill the global @crc_table with the 256-entry CRC-32 lookup table
  # derived from the global $Polynomial. Takes no arguments.
  #
  # Loop counters and the accumulator are lexical so that calling this
  # routine does not overwrite same-named package variables elsewhere.
  my ($i, $j, $crc_accum);

  # 0x7fffffff compensates for signed integers within PERL
  for ($i = 0; $i < 256; $i++)
        {
        $crc_accum = $i;
        for ($j = 8; $j > 0; $j--)
                {
                if ($crc_accum & 0x00000001)
                        { $crc_accum = (($crc_accum >> 1) & 0x7fffffff) ^ $Polynomial; }
                else
                        { $crc_accum = ($crc_accum >> 1) & 0x7fffffff; }
                }
        # Single element assignment: $crc_table[...], not the slice @crc_table[...].
        $crc_table[$i] = $crc_accum;
        }
  }


#############################################################################
# $x=&GenCRC($filename);                                                    #
# Routine generates a CRC32 sum for the given $filename.                    #
#############################################################################
#
sub GenCRC
  {
  # $crc = &GenCRC($filename);
  #
  # Compute the CRC-32 of the contents of $filename using the global
  # lookup table @crc_table (which must already have been filled by
  # &GeneratePolyTable) and return it as an unsigned integer.
  #
  # Prints a message and exits if the file cannot be opened or read.
  # Warns if the number of bytes read differs from the size reported by
  # stat (for example because the file changed while it was being read).
  my ($filename) = @_;
  my ($rpt_size, $act_size, $buffer, $got, $fh);
  my $crc = 0xffffffff;             # initial CRC value

  $rpt_size = (stat($filename))[7];

  # Three-argument open: the name is taken literally, so leading/trailing
  # whitespace or mode characters in it cannot change what gets opened.
  if ( !open($fh, '<', $filename) )
        {
        print "Unable to read $filename for CRC generation.\n";
        exit;
        }
  binmode($fh); # Added for DOS users...

  # Read in fixed-size chunks so memory use stays bounded for large files.
  $act_size = 0;
  while ($got = read($fh, $buffer, 65536))
        {
        $act_size += $got;
        foreach my $byte (unpack("C*", $buffer))
                {
                $crc = (($crc >> 8) & 0x00ffffff) ^ $crc_table[($crc ^ $byte) & 0xff];
                }
        }

  # read() returns undef on error and 0 at end of file.
  if ( !defined($got) )
        {
        close($fh);
        print "Can't read all of $filename.\n";
        exit;
        }
  close($fh);

  if ( $rpt_size != $act_size )
        { warn "Bytes read does != 'stat' size\n"; }

  return ($crc ^ 0xffffffff);       # XOR the return value
  }


#############################################################################
# Main routine begins.                                                      #
#############################################################################
#

# Command line handling.
#
# Every argument that starts with '-' is treated as a bundle of flag letters
# (r, d, i, h, ?); the first argument that does not start with '-' is the
# base directory. Flags and the directory may appear in any order, and flags
# may be given bundled ("-rd") or separately ("-r -d").
if (!@ARGV) { &Help(); } # Help the user with the syntax

foreach $arg (@ARGV)
        {
        if ($arg =~ /^-/)
                {
                if ($arg =~ /r/) { $recurse = 1; }
                if ($arg =~ /d/) { $debug = 1; }
                if ($arg =~ /i/) { $internal = 1; }
                # A literal 'h' or '?'. (An escaped \h would mean
                # "horizontal whitespace" on modern perls, not the letter.)
                if ($arg =~ /[h?]/) { &Help(); }
                }
        elsif (!defined($dir))
                {
                $dir = $arg;
                }
        }

# The lookup table is only needed for the internal CRC32 routine; build it
# once, however many times 'i' appeared on the command line.
if ($internal) { &GeneratePolyTable(); }

# If no start directory was given, use the current as our base
$dir = "." unless (defined($dir) && length($dir));

&getdir($dir, $recurse);

# Report phase.
#
# Each element of @Array is "<checksum>\t<path>". After sorting, entries with
# equal checksums are adjacent, so a single pass that compares each entry
# with its predecessor is enough to print every group of possible duplicates.
# $Flag is true while a duplicate group is being printed.
undef($Flag);
foreach $line (sort(@Array))
  {
  # Limit of 2 keeps any tab characters inside the file name intact.
  ($CurrentCRC, $CurrentFile) = split("\t", $line, 2);

  if (defined($LastCRC) && ($CurrentCRC eq $LastCRC))
    {
    if (!$Flag)
      {
      # First repeat of this checksum: open a new group and list the
      # predecessor, which has not been printed yet.
      $Flag = 1;
      print "--- Possible Duplicates ---\n";
      print " $LastFile\n";
      }
    # print, not printf: file names may legitimately contain '%'.
    print " $CurrentFile\n";
    }
  elsif ($Flag)
    {
    # Checksum changed: close the group.
    print "===========================\n";
    undef($Flag);
    }

  $LastFile = $CurrentFile;
  $LastCRC  = $CurrentCRC;
  }

# Close a group that runs up to the very last entry.
if ($Flag) { print "===========================\n"; }


# Source: geocities.com/fcheck2000