This commit is contained in:
pranavbrkr 2023-11-28 15:30:34 -07:00
parent e97be19053
commit f5392d61e9
3 changed files with 109 additions and 95 deletions

Binary file not shown.

File diff suppressed because one or more lines are too long

View File

@ -68,7 +68,6 @@ def getCollection(db, collection):
client = MongoClient("mongodb://localhost:27017") client = MongoClient("mongodb://localhost:27017")
return client[db][collection] return client[db][collection]
def euclidean_distance_measure(img_1_fd, img_2_fd): def euclidean_distance_measure(img_1_fd, img_2_fd):
img_1_fd_reshaped = img_1_fd.flatten() img_1_fd_reshaped = img_1_fd.flatten()
img_2_fd_reshaped = img_2_fd.flatten() img_2_fd_reshaped = img_2_fd.flatten()
@ -86,75 +85,88 @@ valid_feature_models = {
"resnet": "resnet_fd", "resnet": "resnet_fd",
} }
class Node:
    """One node of a binary decision tree.

    A node is either an internal split (``feature``/``threshold`` plus the
    ``left`` and ``right`` children) or a leaf (``value`` set). All fields
    default to ``None`` so a caller fills in only the ones that apply.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        self.feature = feature      # Index of feature to split on
        self.threshold = threshold  # Threshold value for the feature
        self.left = left            # Left child node
        self.right = right          # Right child node
        self.value = value          # Class label for leaf node (if applicable)
class DecisionTree:
    """Binary decision-tree classifier grown greedily by information gain.

    Every internal node tests ``x[feature] <= threshold``: samples that pass
    go to the left child, the rest go to the right child. A leaf stores the
    most frequent training label among the samples that reached it.
    """

    def __init__(self, max_depth=None):
        """Create an unfitted tree.

        Args:
            max_depth: Depth at which growth stops and a leaf is emitted.
                ``None`` means depth never stops growth.
        """
        self.max_depth = max_depth  # Maximum depth of the tree
        self.tree = None  # Root node of the tree

    @staticmethod
    def _majority_label(y):
        """Return the most frequent label in *y* (the smallest label wins ties).

        Counting via ``np.unique`` instead of ``np.bincount`` gives the same
        answer for non-negative integer labels and also accepts negative or
        non-integer labels.
        """
        labels, counts = np.unique(y, return_counts=True)
        return labels[np.argmax(counts)]

    def entropy(self, y):
        """Return the base-2 Shannon entropy of the labels in *y*."""
        if len(y) == 0:
            return 0.0
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        # Counts from np.unique are always >= 1, so log2 never sees a zero.
        return -np.sum(probabilities * np.log2(probabilities))

    def information_gain(self, X, y, feature, threshold):
        """Return the entropy reduction from splitting on ``X[:, feature] <= threshold``.

        A split that leaves one side empty separates nothing and yields 0.0.
        """
        left_idxs = X[:, feature] <= threshold
        right_idxs = ~left_idxs

        left_y = y[left_idxs]
        right_y = y[right_idxs]

        if len(left_y) == 0 or len(right_y) == 0:
            return 0.0

        p_left = len(left_y) / len(y)
        p_right = len(right_y) / len(y)

        gain = self.entropy(y) - (p_left * self.entropy(left_y) + p_right * self.entropy(right_y))
        return gain

    def find_best_split(self, X, y):
        """Return ``(feature, threshold)`` of the split with the highest gain.

        Every distinct value of every feature column is tried as a threshold.
        Returns ``(None, None)`` when no split has strictly positive gain.
        """
        best_gain = 0
        best_feature = None
        best_threshold = None

        for feature in range(X.shape[1]):
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                gain = self.information_gain(X, y, feature, threshold)
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_threshold = threshold

        return best_feature, best_threshold

    def build_tree(self, X, y, depth=0):
        """Recursively grow the subtree for samples *X* with labels *y*.

        Growth stops with a majority-label leaf when the labels are pure, the
        depth limit is reached, or no split improves on the parent.
        """
        # ">=" rather than "==" so a starting depth beyond the limit still stops.
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if len(np.unique(y)) == 1 or depth_exhausted:
            return Node(value=self._majority_label(y))

        best_feature, best_threshold = self.find_best_split(X, y)

        if best_feature is None:
            return Node(value=self._majority_label(y))

        left_idxs = X[:, best_feature] <= best_threshold
        right_idxs = ~left_idxs

        left_subtree = self.build_tree(X[left_idxs], y[left_idxs], depth + 1)
        right_subtree = self.build_tree(X[right_idxs], y[right_idxs], depth + 1)

        return Node(feature=best_feature, threshold=best_threshold, left=left_subtree, right=right_subtree)

    def fit(self, X, y):
        """Grow the tree from training samples *X* and their labels *y*.

        Inputs are converted with ``np.asarray`` so plain nested lists work.

        Raises:
            ValueError: If *y* is empty.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if len(y) == 0:
            raise ValueError("Cannot fit a DecisionTree on empty training data")
        self.tree = self.build_tree(X, y)

    def predict_instance(self, x, node):
        """Return the leaf label reached by sample *x* starting from *node*."""
        # Iterative descent: a leaf is any node whose value is set.
        while node.value is None:
            node = node.left if x[node.feature] <= node.threshold else node.right
        return node.value

    def predict(self, X):
        """Return an array with the predicted label for each sample in *X*.

        ``fit`` must have been called first so that ``self.tree`` is set.
        """
        predictions = [self.predict_instance(x, self.tree) for x in X]
        return np.array(predictions)
class LSH: class LSH:
def __init__(self, data, num_layers, num_hashes): def __init__(self, data, num_layers, num_hashes):
@ -177,7 +189,7 @@ class LSH:
hash_code = self.hash_vector(vector, seed=layer) hash_code = self.hash_vector(vector, seed=layer)
self.hash_tables[layer][hash_code].append(i) self.hash_tables[layer][hash_code].append(i)
def find_similar(self, external_image, t, threshold=0.9): def find_similar(self, external_image, t):
similar_images = set() similar_images = set()
visited_buckets = set() visited_buckets = set()
unique_images_considered = set() unique_images_considered = set()
@ -186,8 +198,15 @@ class LSH:
hash_code = self.hash_vector(external_image, seed=layer) hash_code = self.hash_vector(external_image, seed=layer)
visited_buckets.add(hash_code) visited_buckets.add(hash_code)
# Handling exact matches explicitly
if hash_code in self.hash_tables[layer]:
for idx in self.hash_tables[layer][hash_code]:
similar_images.add(idx)
unique_images_considered.add(idx)
# Searching in nearby buckets based on Hamming distance
for key in self.hash_tables[layer]: for key in self.hash_tables[layer]:
if key != hash_code and self.hamming_distance(key, hash_code) <= 2: if self.hamming_distance(key, hash_code) <= 1:
visited_buckets.add(key) visited_buckets.add(key)
for idx in self.hash_tables[layer][key]: for idx in self.hash_tables[layer][key]: