LSH task

2025-12-06 10:34:07 +00:00 · 2023-11-28 15:30:34 -07:00 · 2023-11-28 15:30:34 -07:00 · f5392d61e9
commit f5392d61e9
parent e97be19053
3 changed files with 109 additions and 95 deletions
--- a/3/decision_tree_10_150.pkl
+++ b/3/decision_tree_10_150.pkl
--- a/3/task_4.ipynb
+++ b/3/task_4.ipynb
--- a/3/utils.py
+++ b/3/utils.py
@ -68,7 +68,6 @@ def getCollection(db, collection):
    client = MongoClient("mongodb://localhost:27017")
    return client[db][collection]
 def euclidean_distance_measure(img_1_fd, img_2_fd):
    img_1_fd_reshaped = img_1_fd.flatten()
    img_2_fd_reshaped = img_2_fd.flatten()
@ -86,75 +85,88 @@ valid_feature_models = {
    "resnet": "resnet_fd",
 }
-
+class Node:
    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        """Store the split description or the leaf label for one tree node.

        Every argument defaults to ``None`` and is stored on the instance
        unchanged; nothing is validated here.

        Args:
            feature: Index of the feature to split on.
            threshold: Threshold value compared against that feature.
            left: Left child node.
            right: Right child node.
            value: Class label for a leaf node (if applicable).
        """
        self.feature = feature          # Index of feature to split on
        self.threshold = threshold      # Threshold value for the feature
        self.left = left                # Left child node
        self.right = right              # Right child node
        self.value = value              # Class label for leaf node (if applicable)
 class DecisionTree:
    def __init__(self, max_depth=None):
-        self.max_depth = max_depth
+        self.max_depth = max_depth      # Maximum depth of the tree
-        self.tree = {}
+        self.tree = None                # Root node of the tree
-    def calculate_gini(self, labels):
+    def entropy(self, y):
-        classes, counts = np.unique(labels, return_counts=True)
+        _, counts = np.unique(y, return_counts=True)
-        probabilities = counts / len(labels)
+        probabilities = counts / len(y)
-        gini = 1 - sum(probabilities ** 2)
+        return -np.sum(probabilities * np.log2(probabilities))
        return gini
-    def find_best_split(self, data, labels):
+    def information_gain(self, X, y, feature, threshold):
-        best_gini = float('inf')
+        left_idxs = X[:, feature] <= threshold
-        best_index = None
+        right_idxs = ~left_idxs
        best_value = None
-        for index in range(len(data[0])):
+        left_y = y[left_idxs]
-            unique_values = np.unique(data[:, index])
+        right_y = y[right_idxs]
            for value in unique_values:
                left_indices = np.where(data[:, index] <= value)[0]
                right_indices = np.where(data[:, index] > value)[0]
-                left_gini = self.calculate_gini(labels[left_indices])
+        p_left = len(left_y) / len(y)
-                right_gini = self.calculate_gini(labels[right_indices])
+        p_right = len(right_y) / len(y)
-                gini = (len(left_indices) * left_gini + len(right_indices) * right_gini) / len(data)
+        gain = self.entropy(y) - (p_left * self.entropy(left_y) + p_right * self.entropy(right_y))
        return gain
-                if gini < best_gini:
+    def find_best_split(self, X, y):
-                    best_gini = gini
+        best_gain = 0
-                    best_index = index
+        best_feature = None
-                    best_value = value
+        best_threshold = None
-        return best_index, best_value
+        for feature in range(X.shape[1]):
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                gain = self.information_gain(X, y, feature, threshold)
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_threshold = threshold
-    def build_tree(self, data, labels, depth=0):
+        return best_feature, best_threshold
        if len(np.unique(labels)) == 1 or (self.max_depth and depth >= self.max_depth):
            return {'class': np.argmax(np.bincount(labels))}
-        best_index, best_value = self.find_best_split(data, labels)
+    def build_tree(self, X, y, depth=0):
-        left_indices = np.where(data[:, best_index] <= best_value)[0]
+        if len(np.unique(y)) == 1 or depth == self.max_depth:
-        right_indices = np.where(data[:, best_index] > best_value)[0]
+            return Node(value=np.argmax(np.bincount(y)))
-        left_subtree = self.build_tree(data[left_indices], labels[left_indices], depth + 1)
+        best_feature, best_threshold = self.find_best_split(X, y)
        right_subtree = self.build_tree(data[right_indices], labels[right_indices], depth + 1)
-        return {'index': best_index, 'value': best_value,
+        if best_feature is None:
-                'left': left_subtree, 'right': right_subtree}
+            return Node(value=np.argmax(np.bincount(y)))
-    def fit(self, data, labels):
+        left_idxs = X[:, best_feature] <= best_threshold
-        self.tree = self.build_tree(data, labels)
+        right_idxs = ~left_idxs
-    def predict_sample(self, sample, tree):
+        left_subtree = self.build_tree(X[left_idxs], y[left_idxs], depth + 1)
-        if 'class' in tree:
+        right_subtree = self.build_tree(X[right_idxs], y[right_idxs], depth + 1)
            return tree['class']
-        if sample[tree['index']] <= tree['value']:
+        return Node(feature=best_feature, threshold=best_threshold, left=left_subtree, right=right_subtree)
-            return self.predict_sample(sample, tree['left'])
+    
    def fit(self, X, y):
        """Build the tree from ``X`` and ``y`` and keep it in ``self.tree``.

        Both arguments are handed to ``build_tree`` unchanged, and whatever it
        returns replaces the current value of ``self.tree``. Nothing is
        returned.

        NOTE(review): presumably ``X`` is a 2-D NumPy array (samples x
        features) and ``y`` a 1-D array of non-negative integer labels --
        confirm against ``build_tree`` and the callers.
        """
        self.tree = self.build_tree(X, y)
    def predict_instance(self, x, node):
        if node.value is not None:
            return node.value
        if x[node.feature] <= node.threshold:
            return self.predict_instance(x, node.left)
        else:
-            return self.predict_sample(sample, tree['right'])
+            return self.predict_instance(x, node.right)
-    def predict(self, data):
+    def predict(self, X):
        predictions = []
-        for sample in data:
+        for x in X:
-            prediction = self.predict_sample(sample, self.tree)
+            pred = self.predict_instance(x, self.tree)
-            predictions.append(prediction)
+            predictions.append(pred)
-        return predictions
+        return np.array(predictions)
 class LSH:
    def __init__(self, data, num_layers, num_hashes):
@ -177,7 +189,7 @@ class LSH:
                hash_code = self.hash_vector(vector, seed=layer)
                self.hash_tables[layer][hash_code].append(i)
-    def find_similar(self, external_image, t, threshold=0.9):
+    def find_similar(self, external_image, t):
        similar_images = set()
        visited_buckets = set()
        unique_images_considered = set()
@ -186,8 +198,15 @@ class LSH:
            hash_code = self.hash_vector(external_image, seed=layer)
            visited_buckets.add(hash_code)
            # Handling exact matches explicitly
            if hash_code in self.hash_tables[layer]:
                for idx in self.hash_tables[layer][hash_code]:
                    similar_images.add(idx)
                    unique_images_considered.add(idx)
            # Searching in nearby buckets based on Hamming distance
            for key in self.hash_tables[layer]:
-                if key != hash_code and self.hamming_distance(key, hash_code) <= 2:
+                if self.hamming_distance(key, hash_code) <= 1:
                    visited_buckets.add(key)
                    for idx in self.hash_tables[layer][key]: