Snippet: Find Duplicates
Recursively find duplicates in all directories below the current working directory (or below the directory passed as a command-line argument) using checksums.
To run with Python:
import sys
import os
import hashlib


def find_directories(dir_path: str, found_directories: list[str]):
    # Collect all visible subdirectories (hidden ones and __pycache__ are skipped),
    # then recurse into each of them.
    directories = list(filter(lambda x: os.path.isdir(x),
                              map(lambda x: os.path.join(dir_path, x),
                                  filter(lambda x: not x.startswith('.') and x != '__pycache__',
                                         os.listdir(dir_path)))))
    for directory in directories:
        found_directories.append(directory)
        find_directories(directory, found_directories)


def find_files(dir_path: str) -> list[str]:
    # Return every visible, non-directory entry in the given directory.
    files = list(filter(lambda x: not os.path.isdir(x),
                        map(lambda x: os.path.join(dir_path, x),
                            filter(lambda x: not x.startswith('.'), os.listdir(dir_path)))))
    print('Files:', sorted(files))
    return files


def calculate_checksum(file_path: str) -> str:
    # https://stackoverflow.com/a/55542529
    h = hashlib.sha256()
    with open(file_path, 'rb') as file:
        while True:
            # Reading is buffered, so we can read smaller chunks
            chunk = file.read(h.block_size)
            if not chunk:
                break
            h.update(chunk)
    return h.hexdigest()


def main() -> int:
    found_directories = []
    if len(sys.argv) > 1:
        try:
            find_directories(sys.argv[1], found_directories)
        except FileNotFoundError:
            print('No such directory: ', sys.argv[1], '', sep='\'')
            return 1
    else:
        find_directories('.', found_directories)

    # Compute a SHA-256 checksum for every file in every directory found.
    checksums = []
    for directory in sorted(found_directories):
        print('Directory ', directory, ' found!', sep='\'')
        for file in find_files(directory):
            checksums.append((file, calculate_checksum(file)))
        print()

    # Two entries are duplicates if their paths differ but their checksums match;
    # the reversed-pair check keeps each pair from being reported twice.
    results = []
    for item_1 in checksums:
        for item_2 in checksums:
            if item_1[0] != item_2[0] and item_1[1] == item_2[1] and (item_2, item_1) not in results:
                results.append((item_1, item_2))

    print('The identical files are:')
    if results:
        for result in sorted(results, key=lambda x: x[0][0]):
            print(list(map(lambda x: x[0], result)))
    else:
        print([])
    return 0


if __name__ == '__main__':
    sys.exit(main())
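
A hypothetical invocation, assuming the snippet is saved as find_duplicates.py (that filename is not part of the snippet) and run against some directory tree:

python3 find_duplicates.py path/to/project

Each duplicate pair is printed as a two-element list of paths; with three or more identical copies, every pair among them appears separately. For comparison, a minimal sketch of the same idea that groups paths by digest using os.walk and a dictionary; unlike the snippet above it also scans files in the start directory itself and reports each set of identical files as a single group. All names here are illustrative, not part of the original snippet:

import hashlib
import os
import sys
from collections import defaultdict


def sha256_of(path: str) -> str:
    # Hash the file in fixed-size chunks to avoid loading it into memory at once.
    h = hashlib.sha256()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(65536), b''):
            h.update(chunk)
    return h.hexdigest()


def duplicate_groups(root: str) -> list[list[str]]:
    # Map each digest to all paths that produced it; any digest with more than
    # one path is a group of identical files.
    by_digest = defaultdict(list)
    for dir_path, dir_names, file_names in os.walk(root):
        # Prune hidden directories and __pycache__, mirroring the snippet above.
        dir_names[:] = [d for d in dir_names if not d.startswith('.') and d != '__pycache__']
        for name in file_names:
            if not name.startswith('.'):
                path = os.path.join(dir_path, name)
                by_digest[sha256_of(path)].append(path)
    return [sorted(paths) for paths in by_digest.values() if len(paths) > 1]


if __name__ == '__main__':
    for group in duplicate_groups(sys.argv[1] if len(sys.argv) > 1 else '.'):
        print(group)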