In [14]:
from utils import *
warnings.filterwarnings('ignore')
%matplotlib inline
from sklearn.decomposition import PCA

In [15]:
fd_collection = getCollection("team_5_mwdb_phase_2", "fd_collection")

In [16]:
selected_feature_model = "fc_fd"

t = int(input("Enter value of t: "))
if t < 1:
    raise ValueError("t should be a positive integer")

num_layers = int(input("Enter the number of layers: "))
if num_layers < 1:
    raise ValueError("num_layers should be a positive integer")

num_hashes_per_layer = int(input("Enter the number of hashes per layer: "))
if num_hashes_per_layer < 1:
    raise ValueError("num_hashes_per_layer should be a positive integer")

In [17]:
all_images = list(fd_collection.find())
all_images = sorted(all_images, key = lambda x: x["image_id"])

odd_image_ids = [img["image_id"] for img in all_images if img["image_id"] % 2 == 0]

even_image_labels = [img["true_label"] for img in all_images if img["image_id"] % 2 == 0]
odd_image_labels = [img["true_label"] for img in all_images if img["image_id"] % 2 != 0]

feature_vectors = [np.array(img[selected_feature_model]).flatten() for img in all_images]

total_len = len(feature_vectors)
even_feature_vectors = []
odd_feature_vectors = []

for i in range(total_len):
  if i % 2 == 0:
    even_feature_vectors.append(feature_vectors[i])
  else:
    odd_feature_vectors.append(feature_vectors[i])

even_feature_vectors = np.array(even_feature_vectors)
odd_feature_vectors = np.array(odd_feature_vectors)

odd_len = odd_feature_vectors.shape[0]
even_len = even_feature_vectors.shape[0]

In [18]:
import numpy as np
from collections import defaultdict

class LSHIndex:
    def __init__(self, num_layers, num_hashes, dimensions, seed=42):
        self.num_layers = num_layers
        self.num_hashes = num_hashes
        self.dimensions = dimensions
        self.index = [defaultdict(list) for _ in range(num_layers)]
        self.hash_functions = self._generate_hash_functions(seed)

    def _generate_hash_functions(self, seed):
        np.random.seed(seed)
        hash_functions = []
        for _ in range(self.num_layers):
            layer_hashes = []
            for _ in range(self.num_hashes):
                random_projection = np.random.randn(self.dimensions)
                random_projection /= np.linalg.norm(random_projection)
                layer_hashes.append(random_projection)
            hash_functions.append(layer_hashes)
        return hash_functions

    def hash_vector(self, vector):
        hashed_values = []
        for i in range(self.num_layers):
            layer_hashes = self.hash_functions[i]
            layer_hash = [int(np.dot(vector, h) > 0) for h in layer_hashes]
            hashed_values.append(tuple(layer_hash))
        return hashed_values

    def add_vector(self, vector, image_id):
        hashed = self.hash_vector(vector)
        for i in range(self.num_layers):
            self.index[i][hashed[i]].append((image_id, vector))

    def query(self, query_vector):
        hashed_query = self.hash_vector(query_vector)
        candidates = set()
        for i in range(self.num_layers):
            candidates.update(self.index[i][hashed_query[i]])
        return candidates

    def query_t_unique(self, query_vector, t):
        hashed_query = self.hash_vector(query_vector)
        candidates = []
        unique_vectors = set()  # Track unique vectors considered

        for i in range(self.num_layers):
            candidates.extend(self.index[i][hashed_query[i]])

        # Calculate Euclidean distance between query and candidate vectors
        distances = []
        for candidate in candidates:
            unique_vectors.add(tuple(candidate[1]))  # Adding vectors to track uniqueness
            # unique_vectors.add((candidate))  # Adding vectors to track uniqueness
            distance = np.linalg.norm(candidate[0] - query_vector)
            distances.append(distance)

        # Sort candidates based on Euclidean distance and get t unique similar vectors
        unique_similar_vectors = []
        for distance, candidate in sorted(zip(distances, candidates)):
            if len(unique_similar_vectors) >= t:
                break
            if tuple(candidate) not in unique_similar_vectors:
                unique_similar_vectors.append(tuple(candidate))

        return list(unique_similar_vectors), len(unique_vectors), len(candidates)

In [19]:
query_image_id = int(input("Enter value of query_image_id: "))
if query_image_id < 1:
    raise ValueError("query_image_id should be a positive integer")

num_vectors = even_feature_vectors.shape[0]
vector_dimension = even_feature_vectors.shape[1]

lsh = LSHIndex(num_layers, num_hashes_per_layer, vector_dimension)

for index, vector in enumerate(even_feature_vectors):
    lsh.add_vector(vector.tolist(), index * 2)

query_vector = np.array(odd_feature_vectors[(query_image_id // 2)])

result_t_unique, unique_count, overall_count = lsh.query_t_unique(query_vector, t)
print(f"Number of unique vectors considered: {unique_count}")
print(f"Overall number of vectors considered: {overall_count}")
print(f"{t} Most Similar Vectors:")
print(f"Label of the input image: {odd_image_labels[(query_image_id // 2)]}")
for result in result_t_unique:
    print(f"Label: {even_image_labels[(result[0] // 2)]}, {result}")

Number of unique vectors considered: 55
Overall number of vectors considered: 56
20 Most Similar Vectors:
Label of the input image: 30
Label: 1, (454, [0.2332000434398651, 0.07741810381412506, 0.05488063395023346, -0.34052515029907227, -0.22374911606311798, -0.5841524004936218, -0.6228158473968506, -0.081851527094841, -0.25591951608657837, -0.2172556221485138, -0.22115090489387512, -0.40955448150634766, -0.40356117486953735, -0.5474131107330322, -0.2764815390110016, -0.48939070105552673, -0.38801583647727966, -0.36290353536605835, -0.3398980498313904, -0.2590472400188446, -0.31475523114204407, -0.34429046511650085, -0.18438300490379333, -0.3244428038597107, -0.18492211401462555, -0.4719095528125763, -0.4257151484489441, -0.37070584297180176, 0.18500417470932007, -0.37800002098083496, -0.4757663607597351, -0.4288094937801361, -0.37333470582962036, -0.254594624042511, -0.793333888053894, -0.603157103061676, 0.12041875720024109, 0.12864024937152863, -0.332401305437088, 0.17326293885707855