Snippet: Find Duplicates
Duplikate rekursiv in allen Unterverzeichnissen beginnend in dem, das als Kommandozeilenargument übergeben wurde (oder im aktuellen Arbeitsverzeichnis), mithilfe von Prüfsummen (SHA-256) finden.
Zur Ausführung mit Python (ab Version 3.11, da `hashlib.file_digest` verwendet wird):
#!/usr/bin/env python3
import sys
import os
import hashlib
class Directory:
    """A directory that eagerly scans itself and all of its subdirectories.

    Constructing an instance lists ``path``, ignores hidden entries (names
    starting with a dot), builds a ``Directory`` for every real subdirectory
    (recursively) and a ``File`` for every regular file.
    """

    def __init__(self, path: str) -> None:
        """Scan ``path`` and populate ``path``, ``directories`` and ``files``.

        Errors from ``os.listdir`` (for example ``FileNotFoundError`` or
        ``NotADirectoryError``) are not handled here and propagate to the
        caller.
        """
        self.path = path
        entries = self.get_directory_entries(path)
        self.directories = self.get_directories(entries)
        self.files = self.get_files(entries)

    def get_directory_entries(self, directory_path: str) -> list[str]:
        """Return the joined paths of all non-hidden entries, sorted by name."""
        # os.listdir order is arbitrary; sorting makes the report reproducible.
        return [
            os.path.join(directory_path, entry)
            for entry in sorted(os.listdir(directory_path))
            if not entry.startswith('.')
        ]

    def get_directories(self, entries: list[str]) -> list['Directory']:
        """Return a ``Directory`` for every entry that is a real directory."""
        # Symlinked directories are skipped: following them can loop forever
        # when a link points back to one of its own ancestors.
        return [
            Directory(entry)
            for entry in entries
            if os.path.isdir(entry) and not os.path.islink(entry)
        ]

    def get_files(self, entries: list[str]) -> list['File']:
        """Return a ``File`` for every entry that is a regular file."""
        # Only regular files (or links to them) can be hashed safely; dangling
        # symlinks would fail to open and FIFOs/sockets/devices could block.
        return [File(entry) for entry in entries if os.path.isfile(entry)]
class File:
checksums: dict[str, list[str]] = {}
def __init__(self, path: str) -> None:
self.path = path
self.checksum = self.get_file_checksum(path)
if self.checksum not in self.checksums:
self.checksums[self.checksum] = [self.path]
else:
self.checksums[self.checksum].append(self.path)
def get_file_checksum(self, file_path: str) -> str:
with open(file_path, 'rb') as file:
digest = hashlib.file_digest(file, 'sha256')
return digest.hexdigest()
def print_directories(directory: Directory, directory_path: str, subdirectory=False) -> None:
    """Print ``directory``, its file names and, recursively, its subdirectories.

    Args:
        directory: The scanned directory to print.
        directory_path: Path of the scan root; subdirectories are shown
            relative to it.
        subdirectory: ``False`` for the scan root (printed with its full
            path), ``True`` for nested directories (printed as ``./...``).
    """
    if subdirectory:
        # relpath (rather than stripping a string prefix) stays correct when
        # the root ends in a separator, e.g. when scanning '/'.
        relative_path = os.path.relpath(directory.path, directory_path)
        print('Subdirectory:', '\'' + os.path.join('.', relative_path) + '\'')
    else:
        print('Directory:', '\'' + directory.path + '\'')
    print_files([os.path.basename(file.path) for file in directory.files])
    print()
    # A separate loop name avoids shadowing the ``subdirectory`` flag.
    for child in directory.directories:
        print_directories(child, directory_path, True)
def print_files(files: list[str]) -> None:
    """Print ``files`` as a bracketed, one-name-per-line list.

    Each name is shown in single quotes with a leading space and the entries
    are separated by commas. An empty list prints a "no files" notice instead.
    """
    if not files:
        print('No files found in this directory!')
        return
    quoted_names = [f" '{name}'" for name in files]
    print('Files: [')
    print(',\n'.join(quoted_names))
    print(']')
def print_identical_files(directory_path: str) -> None:
    """Print every group of files in ``File.checksums`` sharing a checksum.

    Only checksums recorded for more than one path are reported. Paths are
    shown relative to ``directory_path`` with a leading ``./``.

    Args:
        directory_path: Path of the scan root used to relativise file paths.
    """
    duplicates = {
        checksum: paths
        for checksum, paths in File.checksums.items()
        if len(paths) > 1
    }
    if not duplicates:
        print('\nNo identical files found (using SHA-256)!')
        return
    print('\nThe identical files are (found using SHA-256):')
    for checksum, paths in duplicates.items():
        print('\nChecksum:', '\'' + checksum + '\'')
        # relpath (rather than stripping a string prefix) stays correct when
        # the root ends in a separator, e.g. when scanning '/'.
        print_files([
            os.path.join('.', os.path.relpath(path, directory_path))
            for path in paths
        ])
def main() -> int:
    """Scan a directory tree and report its contents and duplicate files.

    The directory is taken from the command-line arguments; without
    arguments the current working directory is used.

    Returns:
        ``0`` on success, ``1`` if the directory could not be scanned.
    """
    directory_path: str
    if len(sys.argv) <= 1:
        print('No directory was passed as an argument!')
        print('The current working directory is used instead …\n')
        directory_path = os.getcwd()
    else:
        print('A directory was passed as an argument …\n')
        # All arguments are joined so an unquoted path containing spaces
        # still works; slicing leaves the global sys.argv unmodified.
        directory_path = os.path.abspath(' '.join(sys.argv[1:]))
    try:
        directory = Directory(directory_path)
    except FileNotFoundError:
        reason = 'No such directory!'
    except NotADirectoryError:
        reason = 'Not a directory!'
    except OSError as error:
        # Any other OS-level failure during the scan (e.g. PermissionError)
        # is reported instead of escaping as a traceback.
        reason = str(error)
    else:
        print_directories(directory, directory_path)
        print_identical_files(directory_path)
        return 0
    print('But an error occurred:', reason)
    print('Argument:', '\'' + directory_path + '\'')
    return 1
# Run the scan only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == '__main__':
    raise SystemExit(main())