In [39]:
from utils import *
warnings.filterwarnings('ignore')
%matplotlib inline
from sklearn.decomposition import PCA

In [40]:
fd_collection = getCollection("team_5_mwdb_phase_2", "fd_collection")

In [41]:
selected_feature_model = "fc_fd"

t = int(input("Enter value of t: "))
if t < 1:
    raise ValueError("t should be a positive integer")

num_layers = int(input("Enter the number of layers: "))
if num_layers < 1:
    raise ValueError("num_layers should be a positive integer")

num_hashes_per_layer = int(input("Enter the number of hashes per layer: "))
if num_hashes_per_layer < 1:
    raise ValueError("num_hashes_per_layer should be a positive integer")

In [42]:
all_images = list(fd_collection.find())
all_images = sorted(all_images, key = lambda x: x["image_id"])

odd_image_ids = [img["image_id"] for img in all_images if img["image_id"] % 2 == 0]

even_image_labels = [img["true_label"] for img in all_images if img["image_id"] % 2 == 0]
odd_image_labels = [img["true_label"] for img in all_images if img["image_id"] % 2 != 0]

feature_vectors = [np.array(img[selected_feature_model]).flatten() for img in all_images]

total_len = len(feature_vectors)
even_feature_vectors = []
odd_feature_vectors = []

for i in range(total_len):
  if i % 2 == 0:
    even_feature_vectors.append(feature_vectors[i])
  else:
    odd_feature_vectors.append(feature_vectors[i])

even_feature_vectors = np.array(even_feature_vectors)
odd_feature_vectors = np.array(odd_feature_vectors)

odd_len = odd_feature_vectors.shape[0]
even_len = even_feature_vectors.shape[0]

In [54]:
import numpy as np
from collections import defaultdict

class LSHIndex:
    def __init__(self, num_layers, num_hashes, dimensions):
        self.num_layers = num_layers
        self.num_hashes = num_hashes
        self.dimensions = dimensions
        self.index = [defaultdict(list) for _ in range(num_layers)]
        self.hash_functions = self._generate_hash_functions()

    def _generate_hash_functions(self):
        hash_functions = []
        for _ in range(self.num_layers):
            layer_hashes = []
            for _ in range(self.num_hashes):
                random_projection = np.random.randn(self.dimensions)
                random_projection /= np.linalg.norm(random_projection)
                layer_hashes.append(random_projection)
            hash_functions.append(layer_hashes)
        return hash_functions

    def hash_vector(self, vector):
        hashed_values = []
        for i in range(self.num_layers):
            layer_hashes = self.hash_functions[i]
            layer_hash = [int(np.dot(vector, h) > 0) for h in layer_hashes]
            hashed_values.append(tuple(layer_hash))
        return hashed_values

    def add_vector(self, vector, image_id):
        hashed = self.hash_vector(vector)
        for i in range(self.num_layers):
            self.index[i][hashed[i]].append((image_id, vector))

    def query(self, query_vector):
        hashed_query = self.hash_vector(query_vector)
        candidates = set()
        for i in range(self.num_layers):
            candidates.update(self.index[i][hashed_query[i]])
        return candidates

    def query_t_similar(self, query_vector, t):
        hashed_query = self.hash_vector(query_vector)
        candidates = []
        for i in range(self.num_layers):
            candidates.extend(self.index[i][hashed_query[i]])

        # Calculate Euclidean distance between query and candidate vectors
        distances = [np.linalg.norm(candidate[0] - query_vector) for candidate in candidates]

        # Sort candidates based on Euclidean distance and get top t similar vectors
        similar_vectors = [candidate for _, candidate in sorted(zip(distances, candidates))][:t]

        return similar_vectors

In [55]:
num_vectors = even_feature_vectors.shape[0]
vector_dimension = even_feature_vectors.shape[1]

lsh = LSHIndex(num_layers, num_hashes_per_layer, vector_dimension)

for index, vector in enumerate(even_feature_vectors):
    lsh.add_vector(vector.tolist(), index * 2)

query_vector = np.array(odd_feature_vectors[(1 // 2)])  # You can replace this with your query vector

result_t = lsh.query_t_similar(query_vector, t)
print(f"{t} Most Similar Vectors:")
for result in result_t:
    print(result)

15 Most Similar Vectors:
(20, [1.8676350116729736, -0.11808869242668152, -0.7870825529098511, -0.000766887329518795, -0.6891564130783081, -0.7333865761756897, -0.6145556569099426, 0.5133474469184875, -0.37114042043685913, -0.35024014115333557, 0.26625579595565796, -0.22692371904850006, -0.09660376608371735, -0.05850209295749664, 1.1124564409255981, 0.9980078339576721, -0.12832070887088776, 0.13986000418663025, 0.09640604257583618, 0.17902642488479614, -0.33510521054267883, 0.2255747765302658, -0.4015006422996521, -0.1690637767314911, -0.06382472068071365, 0.30264580249786377, -0.06097522750496864, 1.0156663656234741, 0.922686755657196, -0.020584329962730408, 0.2425142228603363, -0.19292673468589783, 0.8032246232032776, -0.5490302443504333, -0.006973720155656338, -0.3639185428619385, 1.0822560787200928, 0.6534451246261597, -0.18960878252983093, 1.1484874486923218, 0.6679640412330627, 0.6189641952514648, 0.1420220136642456, -0.5236685276031494, 0.6937704086303711, -0.1495991051197052, 1.