In [64]:
from utils import *
warnings.filterwarnings('ignore')
%matplotlib inline
from sklearn.decomposition import PCA

In [65]:
fd_collection = getCollection("team_5_mwdb_phase_2", "fd_collection")

In [66]:
selected_feature_model = "fc_fd"

t = int(input("Enter value of t: "))
if t < 1:
    raise ValueError("t should be a positive integer")

num_layers = int(input("Enter the number of layers: "))
if num_layers < 1:
    raise ValueError("num_layers should be a positive integer")

num_hashes_per_layer = int(input("Enter the number of hashes per layer: "))
if num_hashes_per_layer < 1:
    raise ValueError("num_hashes_per_layer should be a positive integer")

In [67]:
all_images = list(fd_collection.find())
all_images = sorted(all_images, key = lambda x: x["image_id"])

odd_image_ids = [img["image_id"] for img in all_images if img["image_id"] % 2 == 0]

even_image_labels = [img["true_label"] for img in all_images if img["image_id"] % 2 == 0]
odd_image_labels = [img["true_label"] for img in all_images if img["image_id"] % 2 != 0]

feature_vectors = [np.array(img[selected_feature_model]).flatten() for img in all_images]

total_len = len(feature_vectors)
even_feature_vectors = []
odd_feature_vectors = []

for i in range(total_len):
  if i % 2 == 0:
    even_feature_vectors.append(feature_vectors[i])
  else:
    odd_feature_vectors.append(feature_vectors[i])

even_feature_vectors = np.array(even_feature_vectors)
odd_feature_vectors = np.array(odd_feature_vectors)

odd_len = odd_feature_vectors.shape[0]
even_len = even_feature_vectors.shape[0]

In [68]:
import numpy as np
from collections import defaultdict

class LSHIndex:
    def __init__(self, num_layers, num_hashes, dimensions, seed=42):
        self.num_layers = num_layers
        self.num_hashes = num_hashes
        self.dimensions = dimensions
        self.index = [defaultdict(list) for _ in range(num_layers)]
        self.hash_functions = self._generate_hash_functions(seed)

    def _generate_hash_functions(self, seed):
        np.random.seed(seed)
        hash_functions = []
        for _ in range(self.num_layers):
            layer_hashes = []
            for _ in range(self.num_hashes):
                random_projection = np.random.randn(self.dimensions)
                random_projection /= np.linalg.norm(random_projection)
                layer_hashes.append(random_projection)
            hash_functions.append(layer_hashes)
        return hash_functions

    def hash_vector(self, vector):
        hashed_values = []
        for i in range(self.num_layers):
            layer_hashes = self.hash_functions[i]
            layer_hash = [int(np.dot(vector, h) > 0) for h in layer_hashes]
            hashed_values.append(tuple(layer_hash))
        return hashed_values

    def add_vector(self, vector, image_id):
        hashed = self.hash_vector(vector)
        for i in range(self.num_layers):
            self.index[i][hashed[i]].append((image_id, vector))

    def query(self, query_vector):
        hashed_query = self.hash_vector(query_vector)
        candidates = set()
        for i in range(self.num_layers):
            candidates.update(self.index[i][hashed_query[i]])
        return candidates

    def query_t_similar(self, query_vector, t):
        hashed_query = self.hash_vector(query_vector)
        candidates = []
        for i in range(self.num_layers):
            candidates.extend(self.index[i][hashed_query[i]])

        # Calculate Euclidean distance between query and candidate vectors
        distances = [np.linalg.norm(candidate[0] - query_vector) for candidate in candidates]

        # Sort candidates based on Euclidean distance and get top t similar vectors
        similar_vectors = [candidate for _, candidate in sorted(zip(distances, candidates))][:t]

        return similar_vectors

In [69]:
query_image_id = int(input("Enter value of query_image_id: "))
if query_image_id < 1:
    raise ValueError("query_image_id should be a positive integer")

num_vectors = even_feature_vectors.shape[0]
vector_dimension = even_feature_vectors.shape[1]

lsh = LSHIndex(num_layers, num_hashes_per_layer, vector_dimension)

for index, vector in enumerate(even_feature_vectors):
    lsh.add_vector(vector.tolist(), index * 2)

query_vector = np.array(odd_feature_vectors[(8675 // 2)])  # You can replace this with your query vector

result_t = lsh.query_t_similar(query_vector, t)
print(f"{t} Most Similar Vectors:")
for result in result_t:
    print(result)

15 Most Similar Vectors:
(0, [0.5122689008712769, 0.2500603497028351, 0.05646133795380592, 0.057021159678697586, 0.2841868996620178, -0.25188350677490234, -0.3758462965488434, 0.29251009225845337, -0.3238709568977356, -0.053496576845645905, 0.1960691213607788, -0.01589503511786461, -0.22548939287662506, -0.04023190215229988, -0.018099414184689522, -0.4227128326892853, 0.00019394978880882263, 0.019080746918916702, -0.5834767818450928, -0.1178731918334961, 0.13620014488697052, -0.05805489420890808, 0.028416864573955536, 0.014113139361143112, 0.20302999019622803, -0.164076566696167, -0.13326865434646606, 0.21066470444202423, 0.09049391746520996, -0.08267854154109955, -0.15908563137054443, -0.0707579031586647, -0.05239798128604889, -0.09647662937641144, -0.7233096361160278, -0.18991529941558838, 0.21813008189201355, -0.09771038591861725, -0.3487809896469116, 0.10251305997371674, 0.0668923407793045, -0.006204815581440926, 0.4081032872200012, -0.1528521329164505, -0.14640718698501587, 0.1708