LSH task

2026-03-11 23:34:05 +00:00 · 2023-11-28 15:30:34 -07:00
parent e97be19053
commit f5392d61e9
3 changed files with 109 additions and 95 deletions
--- a/3/decision_tree_10_150.pkl
+++ b/3/decision_tree_10_150.pkl
--- a/3/task_4.ipynb
+++ b/3/task_4.ipynb
--- a/3/utils.py
+++ b/3/utils.py
@@ -68,7 +68,6 @@ def getCollection(db, collection):
    client = MongoClient("mongodb://localhost:27017")
    return client[db][collection]

-
 def euclidean_distance_measure(img_1_fd, img_2_fd):
    img_1_fd_reshaped = img_1_fd.flatten()
    img_2_fd_reshaped = img_2_fd.flatten()
@@ -86,75 +85,88 @@ valid_feature_models = {
    "resnet": "resnet_fd",
 }

-
+class Node:
+    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
+        self.feature = feature          # Index of feature to split on
+        self.threshold = threshold      # Threshold value for the feature
+        self.left = left                # Left child node
+        self.right = right              # Right child node
+        self.value = value              # Class label for leaf node (if applicable)

 class DecisionTree:
    def __init__(self, max_depth=None):
-        self.max_depth = max_depth
-        self.tree = {}
-
-    def calculate_gini(self, labels):
-        classes, counts = np.unique(labels, return_counts=True)
-        probabilities = counts / len(labels)
-        gini = 1 - sum(probabilities ** 2)
-        return gini
-
-    def find_best_split(self, data, labels):
-        best_gini = float('inf')
-        best_index = None
-        best_value = None
-
-        for index in range(len(data[0])):
-            unique_values = np.unique(data[:, index])
-            for value in unique_values:
-                left_indices = np.where(data[:, index] <= value)[0]
-                right_indices = np.where(data[:, index] > value)[0]
-
-                left_gini = self.calculate_gini(labels[left_indices])
-                right_gini = self.calculate_gini(labels[right_indices])
-
-                gini = (len(left_indices) * left_gini + len(right_indices) * right_gini) / len(data)
-
-                if gini < best_gini:
-                    best_gini = gini
-                    best_index = index
-                    best_value = value
-
-        return best_index, best_value
-
-    def build_tree(self, data, labels, depth=0):
-        if len(np.unique(labels)) == 1 or (self.max_depth and depth >= self.max_depth):
-            return {'class': np.argmax(np.bincount(labels))}
-
-        best_index, best_value = self.find_best_split(data, labels)
-        left_indices = np.where(data[:, best_index] <= best_value)[0]
-        right_indices = np.where(data[:, best_index] > best_value)[0]
-
-        left_subtree = self.build_tree(data[left_indices], labels[left_indices], depth + 1)
-        right_subtree = self.build_tree(data[right_indices], labels[right_indices], depth + 1)
-
-        return {'index': best_index, 'value': best_value,
-                'left': left_subtree, 'right': right_subtree}
-
-    def fit(self, data, labels):
-        self.tree = self.build_tree(data, labels)
-
-    def predict_sample(self, sample, tree):
-        if 'class' in tree:
-            return tree['class']
+        self.max_depth = max_depth      # Maximum depth of the tree
+        self.tree = None                # Root node of the tree
+    
+    def entropy(self, y):
+        _, counts = np.unique(y, return_counts=True)
+        probabilities = counts / len(y)
+        return -np.sum(probabilities * np.log2(probabilities))
+    
+    def information_gain(self, X, y, feature, threshold):
+        left_idxs = X[:, feature] <= threshold
+        right_idxs = ~left_idxs
        
-        if sample[tree['index']] <= tree['value']:
-            return self.predict_sample(sample, tree['left'])
+        left_y = y[left_idxs]
+        right_y = y[right_idxs]
+        
+        p_left = len(left_y) / len(y)
+        p_right = len(right_y) / len(y)
+        
+        gain = self.entropy(y) - (p_left * self.entropy(left_y) + p_right * self.entropy(right_y))
+        return gain
+    
+    def find_best_split(self, X, y):
+        best_gain = 0
+        best_feature = None
+        best_threshold = None
+        
+        for feature in range(X.shape[1]):
+            thresholds = np.unique(X[:, feature])
+            for threshold in thresholds:
+                gain = self.information_gain(X, y, feature, threshold)
+                if gain > best_gain:
+                    best_gain = gain
+                    best_feature = feature
+                    best_threshold = threshold
+        
+        return best_feature, best_threshold
+    
+    def build_tree(self, X, y, depth=0):
+        if len(np.unique(y)) == 1 or depth == self.max_depth:
+            return Node(value=np.argmax(np.bincount(y)))
+        
+        best_feature, best_threshold = self.find_best_split(X, y)
+        
+        if best_feature is None:
+            return Node(value=np.argmax(np.bincount(y)))
+        
+        left_idxs = X[:, best_feature] <= best_threshold
+        right_idxs = ~left_idxs
+        
+        left_subtree = self.build_tree(X[left_idxs], y[left_idxs], depth + 1)
+        right_subtree = self.build_tree(X[right_idxs], y[right_idxs], depth + 1)
+        
+        return Node(feature=best_feature, threshold=best_threshold, left=left_subtree, right=right_subtree)
+    
+    def fit(self, X, y):
+        self.tree = self.build_tree(X, y)
+    
+    def predict_instance(self, x, node):
+        if node.value is not None:
+            return node.value
+        
+        if x[node.feature] <= node.threshold:
+            return self.predict_instance(x, node.left)
        else:
-            return self.predict_sample(sample, tree['right'])
-
-    def predict(self, data):
+            return self.predict_instance(x, node.right)
+    
+    def predict(self, X):
        predictions = []
-        for sample in data:
-            prediction = self.predict_sample(sample, self.tree)
-            predictions.append(prediction)
-        return predictions
-
+        for x in X:
+            pred = self.predict_instance(x, self.tree)
+            predictions.append(pred)
+        return np.array(predictions)

 class LSH:
    def __init__(self, data, num_layers, num_hashes):
@@ -177,7 +189,7 @@ class LSH:
                hash_code = self.hash_vector(vector, seed=layer)
                self.hash_tables[layer][hash_code].append(i)

-    def find_similar(self, external_image, t, threshold=0.9):
+    def find_similar(self, external_image, t):
        similar_images = set()
        visited_buckets = set()
        unique_images_considered = set()
@@ -186,8 +198,15 @@ class LSH:
            hash_code = self.hash_vector(external_image, seed=layer)
            visited_buckets.add(hash_code)

+            # Handling exact matches explicitly
+            if hash_code in self.hash_tables[layer]:
+                for idx in self.hash_tables[layer][hash_code]:
+                    similar_images.add(idx)
+                    unique_images_considered.add(idx)
+
+            # Searching in nearby buckets based on Hamming distance
            for key in self.hash_tables[layer]:
-                if key != hash_code and self.hamming_distance(key, hash_code) <= 2:
+                if self.hamming_distance(key, hash_code) <= 1:
                    visited_buckets.add(key)

                    for idx in self.hash_tables[layer][key]: