| import numpy as np |
| from tensorflow.keras.applications import ResNet50 |
| from tensorflow.keras.preprocessing import image |
| from tensorflow.keras.applications.resnet50 import preprocess_input |
| from sklearn.metrics.pairwise import cosine_similarity |
| import os |
|
|
| |
# Shared feature extractor: ImageNet-pretrained ResNet50 without its
# classification head; pooling='avg' yields one flat vector per image.
model = ResNet50(
    weights='imagenet',
    include_top=False,
    pooling='avg',
)
|
|
|
|
| |
def extract_features(img_path, model):
    """Return the flattened feature vector `model` predicts for one image.

    The image at `img_path` is loaded at 224x224, converted to an array,
    given a leading batch axis, passed through `preprocess_input`, and fed
    to `model.predict`.

    Args:
        img_path: Path of the image file to load.
        model: Object whose `predict` method accepts the preprocessed batch.

    Returns:
        The prediction flattened to one dimension.
    """
    loaded = image.load_img(img_path, target_size=(224, 224))
    # np.newaxis adds the leading batch axis of size 1.
    batch = image.img_to_array(loaded)[np.newaxis, ...]
    return model.predict(preprocess_input(batch)).flatten()
|
|
|
|
| |
def find_duplicates(image_dir, threshold=0.9):
    """Find images in `image_dir` that closely resemble an earlier image.

    Every regular file directly inside `image_dir` is passed to
    `extract_features` together with the module-level `model`. Files are
    visited in sorted name order, and a file is reported as a duplicate when
    its cosine similarity to any file earlier in that order exceeds
    `threshold`.

    Args:
        image_dir: Directory to scan (not recursive).
        threshold: Similarity strictly above this value counts as a duplicate.

    Returns:
        A `(count, duplicates)` tuple where `duplicates` is the set of
        duplicate file names and `count` is its size.
    """
    import warnings

    file_list = []
    feature_list = []
    # os.listdir order is arbitrary; sorting makes it deterministic which
    # member of a similar pair is reported as the duplicate.
    for img_file in sorted(os.listdir(image_dir)):
        img_path = os.path.join(image_dir, img_file)
        if not os.path.isfile(img_path):
            continue  # subdirectories and other non-files are not images
        try:
            features = extract_features(img_path, model)
        except OSError as err:
            # An unreadable or non-image file would otherwise abort the scan.
            warnings.warn(f"Skipping {img_path}: {err}")
            continue
        file_list.append(img_file)
        feature_list.append(features)

    # Fewer than two usable images means there is nothing to compare.
    if len(file_list) < 2:
        return 0, set()

    # One vectorized call instead of one call per pair of images.
    similarity_matrix = cosine_similarity(np.vstack(feature_list))

    # Entry [i, j] of the strict upper triangle pairs image j with an earlier
    # image i; any hit in column j marks image j as a duplicate. The
    # comparison happens before masking so a negative threshold cannot match
    # the masked-out cells.
    is_duplicate = np.triu(similarity_matrix > threshold, k=1).any(axis=0)
    duplicates = {
        name for name, flagged in zip(file_list, is_duplicate) if flagged
    }
    return len(duplicates), duplicates
|
|
|
|
if __name__ == "__main__":
    # Usage: <script> [image_dir] [threshold]
    import sys

    # Both arguments are optional and fall back to the defaults below.
    image_dir = sys.argv[1] if len(sys.argv) > 1 else './images'
    threshold = float(sys.argv[2]) if len(sys.argv) > 2 else 0.9

    count, duplicates = find_duplicates(image_dir, threshold)
    print(f"Duplicate Images Count: {count}")
    # Set iteration order varies between runs; sort for reproducible output.
    for duplicate in sorted(duplicates):
        print(duplicate)
|
|