Snippet: Find Duplicates

Recursively find duplicates in all directories below the current working directory (or below the directory passed as a command-line argument) using checksums.

To run with Python:

import sys
import os
import hashlib


def find_directories(dir_path: str, found_directories: list[str]) -> None:
    # Collect all subdirectories, skipping hidden ones and __pycache__.
    directories = [os.path.join(dir_path, name)
                   for name in os.listdir(dir_path)
                   if not name.startswith('.') and name != '__pycache__'
                   and os.path.isdir(os.path.join(dir_path, name))]

    # Record each subdirectory and recurse into it.
    for directory in directories:
        found_directories.append(directory)
        find_directories(directory, found_directories)


def find_files(dir_path: str) -> list[str]:
    # Collect all non-hidden files directly inside dir_path.
    files = [os.path.join(dir_path, name)
             for name in os.listdir(dir_path)
             if not name.startswith('.')
             and not os.path.isdir(os.path.join(dir_path, name))]

    print('Files:', sorted(files))
    return files


def calculate_checksum(file_path: str) -> str:
    # https://stackoverflow.com/a/55542529
    h = hashlib.sha256()

    with open(file_path, 'rb') as file:
        while True:
            # Reading is buffered, so we can read smaller chunks
            chunk = file.read(h.block_size)
            if not chunk:
                break
            h.update(chunk)

    return h.hexdigest()


def main() -> int:
    found_directories = []
    if len(sys.argv) > 1:
        try:
            find_directories(sys.argv[1], found_directories)
        except FileNotFoundError:
            print(f"No such directory: '{sys.argv[1]}'")
            return 1
    else:
        find_directories('.', found_directories)

    # Hash every file found in every discovered directory.
    checksums = []
    for directory in sorted(found_directories):
        print(f"Directory '{directory}' found!")
        for file in find_files(directory):
            checksums.append((file, calculate_checksum(file)))
        print()

    results = []
    # Pairwise comparison: two distinct paths with the same checksum are duplicates;
    # the check on (item_2, item_1) keeps each pair only once.
    for item_1 in checksums:
        for item_2 in checksums:
            if item_1[0] != item_2[0] and item_1[1] == item_2[1] and (item_2, item_1) not in results:
                results.append((item_1, item_2))

    print('The identical files are:')
    if results:
        for result in sorted(results, key=lambda x: x[0][0]):
            print([item[0] for item in result])
    else:
        print([])

    return 0


if __name__ == '__main__':
    sys.exit(main())

GitHub Gist
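
The duplicate search in main() compares every checksum against every other one, which is quadratic in the number of hashed files. As a minimal sketch of an alternative design (the helper name group_by_checksum is illustrative and not part of the snippet), the (file, checksum) pairs could instead be bucketed in a dictionary keyed by digest, so each group of identical files is found in a single pass:

from collections import defaultdict


def group_by_checksum(checksums: list[tuple[str, str]]) -> dict[str, list[str]]:
    # Bucket file paths under their SHA-256 hex digest.
    groups: dict[str, list[str]] = defaultdict(list)
    for path, digest in checksums:
        groups[digest].append(path)
    # Keep only digests shared by more than one file, i.e. the duplicates.
    return {digest: paths for digest, paths in groups.items() if len(paths) > 1}

Printing the resulting values would then list each set of identical files together instead of as pairs.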