Mirror of https://github.com/20kaushik02/CSE515_MWDB_Project.git, synced 2025-12-06 12:44:06 +00:00
Decision tree and lsh update
This commit is contained in:
parent a10336e5af
commit 8c8af8224c
Binary file not shown.
Phase 3/task_3.ipynb (8835 lines changed)
File diff suppressed because it is too large and one or more lines are too long
Phase 3/utils.py (251 lines changed)
@@ -8,12 +8,8 @@ from scipy.stats import pearsonr
 from collections import defaultdict
 
-# from scipy.sparse.linalg import svds
-# from sklearn.decomposition import NMF
 from sklearn.decomposition import LatentDirichletAllocation
 
-# from sklearn.cluster import KMeans
-
 # Torch
 import torch
 import torchvision.transforms as transforms
 
@@ -40,8 +36,6 @@ from pymongo import MongoClient
 # Visualizing
 import matplotlib.pyplot as plt
 
-
-
 valid_classification_methods = {
     "m-nn": 1,
     "decision-tree": 2,
@@ -71,154 +65,131 @@ valid_feature_models = {
     "resnet": "resnet_fd",
 }
 
-class Node:
-    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
-        self.feature = feature
-        self.threshold = threshold
-        self.left = left
-        self.right = right
-        self.value = value
-
 class DecisionTree:
     def __init__(self, max_depth=None):
         self.max_depth = max_depth
-        self.tree = None
+        self.tree = {}
 
-    def entropy(self, y):
-        _, counts = np.unique(y, return_counts=True)
-        probabilities = counts / len(y)
-        return -np.sum(probabilities * np.log2(probabilities))
-
-    def information_gain(self, X, y, feature, threshold):
-        left_idxs = X[:, feature] <= threshold
-        right_idxs = ~left_idxs
-
-        left_y = y[left_idxs]
-        right_y = y[right_idxs]
-
-        p_left = len(left_y) / len(y)
-        p_right = len(right_y) / len(y)
-
-        gain = self.entropy(y) - (p_left * self.entropy(left_y) + p_right * self.entropy(right_y))
-        return gain
-
-    def find_best_split(self, X, y):
-        best_gain = 0
-        best_feature = None
-        best_threshold = None
-
-        for feature in range(X.shape[1]):
-            thresholds = np.unique(X[:, feature])
-            for threshold in thresholds:
-                gain = self.information_gain(X, y, feature, threshold)
-                if gain > best_gain:
-                    best_gain = gain
-                    best_feature = feature
-                    best_threshold = threshold
-
-        return best_feature, best_threshold
-
-    def build_tree(self, X, y, depth=0):
-        if len(np.unique(y)) == 1 or depth == self.max_depth:
-            return Node(value=np.argmax(np.bincount(y)))
-
-        best_feature, best_threshold = self.find_best_split(X, y)
-
-        if best_feature is None:
-            return Node(value=np.argmax(np.bincount(y)))
-
-        left_idxs = X[:, best_feature] <= best_threshold
-        right_idxs = ~left_idxs
-
-        left_subtree = self.build_tree(X[left_idxs], y[left_idxs], depth + 1)
-        right_subtree = self.build_tree(X[right_idxs], y[right_idxs], depth + 1)
-
-        return Node(feature=best_feature, threshold=best_threshold, left=left_subtree, right=right_subtree)
-
-    def fit(self, X, y):
-        X = np.array(X) # Convert to NumPy array
-        y = np.array(y) # Convert to NumPy array
-        self.tree = self.build_tree(X, y)
-
-    def predict_instance(self, x, node):
-        if node.value is not None:
-            return node.value
-
-        if x[node.feature] <= node.threshold:
-            return self.predict_instance(x, node.left)
-        else:
-            return self.predict_instance(x, node.right)
-
-    def predict(self, X):
-        X = np.array(X) # Convert to NumPy array
-        predictions = []
-        for x in X:
-            pred = self.predict_instance(x, self.tree)
-            predictions.append(pred)
-        return np.array(predictions)
-
-class LSHIndex:
-    def __init__(self, num_layers, num_hashes, dimensions, seed=42):
+    def calculate_gini(self, labels):
+        classes, counts = np.unique(labels, return_counts=True)
+        probabilities = counts / len(labels)
+        gini = 1 - sum(probabilities ** 2)
+        return gini
+
+    def find_best_split(self, data, labels):
+        best_gini = float('inf')
+        best_index = None
+        best_value = None
+
+        for index in range(len(data[0])):
+            unique_values = np.unique(data[:, index])
+            for value in unique_values:
+                left_indices = np.where(data[:, index] <= value)[0]
+                right_indices = np.where(data[:, index] > value)[0]
+
+                left_gini = self.calculate_gini(labels[left_indices])
+                right_gini = self.calculate_gini(labels[right_indices])
+
+                gini = (len(left_indices) * left_gini + len(right_indices) * right_gini) / len(data)
+
+                if gini < best_gini:
+                    best_gini = gini
+                    best_index = index
+                    best_value = value
+
+        return best_index, best_value
+
+    def build_tree(self, data, labels, depth=0):
+        if len(np.unique(labels)) == 1 or (self.max_depth and depth >= self.max_depth):
+            return {'class': np.argmax(np.bincount(labels))}
+
+        best_index, best_value = self.find_best_split(data, labels)
+        left_indices = np.where(data[:, best_index] <= best_value)[0]
+        right_indices = np.where(data[:, best_index] > best_value)[0]
+
+        left_subtree = self.build_tree(data[left_indices], labels[left_indices], depth + 1)
+        right_subtree = self.build_tree(data[right_indices], labels[right_indices], depth + 1)
+
+        return {'index': best_index, 'value': best_value,
+                'left': left_subtree, 'right': right_subtree}
+
+    def fit(self, data, labels):
+        self.tree = self.build_tree(data, labels)
+
+    def predict_sample(self, sample, tree):
+        if 'class' in tree:
+            return tree['class']
+
+        if sample[tree['index']] <= tree['value']:
+            return self.predict_sample(sample, tree['left'])
+        else:
+            return self.predict_sample(sample, tree['right'])
+
+    def predict(self, data):
+        predictions = []
+        for sample in data:
+            prediction = self.predict_sample(sample, self.tree)
+            predictions.append(prediction)
+        return predictions
+
+
+class LSH:
+    def __init__(self, data, num_layers, num_hashes):
+        self.data = data
         self.num_layers = num_layers
         self.num_hashes = num_hashes
-        self.dimensions = dimensions
-        self.index = [defaultdict(list) for _ in range(num_layers)]
-        self.hash_functions = self._generate_hash_functions(seed)
+        self.hash_tables = [defaultdict(list) for _ in range(num_layers)]
+        self.unique_images_considered = set()
+        self.overall_images_considered = set()
+        self.create_hash_tables()
 
-    def _generate_hash_functions(self, seed):
+    def hash_vector(self, vector, seed):
         np.random.seed(seed)
-        hash_functions = []
-        for _ in range(self.num_layers):
-            layer_hashes = []
-            for _ in range(self.num_hashes):
-                random_projection = np.random.randn(self.dimensions)
-                random_projection /= np.linalg.norm(random_projection)
-                layer_hashes.append(random_projection)
-            hash_functions.append(layer_hashes)
-        return hash_functions
+        random_vectors = np.random.randn(self.num_hashes, len(vector))
+        return ''.join(['1' if np.dot(random_vectors[i], vector) >= 0 else '0' for i in range(self.num_hashes)])
 
-    def hash_vector(self, vector):
-        hashed_values = []
-        for i in range(self.num_layers):
-            layer_hashes = self.hash_functions[i]
-            layer_hash = [int(np.dot(vector, h) > 0) for h in layer_hashes]
-            hashed_values.append(tuple(layer_hash))
-        return hashed_values
-
-    def add_vector(self, vector, image_id):
-        hashed = self.hash_vector(vector)
-        for i in range(self.num_layers):
-            self.index[i][hashed[i]].append((image_id, vector))
-
-    def query(self, query_vector):
-        hashed_query = self.hash_vector(query_vector)
-        candidates = set()
-        for i in range(self.num_layers):
-            candidates.update(self.index[i][hashed_query[i]])
-        return candidates
-
-    def query_t_unique(self, query_vector, t):
-        hashed_query = self.hash_vector(query_vector)
-        candidates = []
-        unique_vectors = set() # Track unique vectors considered
-
-        for i in range(self.num_layers):
-            candidates.extend(self.index[i][hashed_query[i]])
-
-        # Calculate Euclidean distance between query and candidate vectors
-        distances = []
-        for candidate in candidates:
-            unique_vectors.add(tuple(candidate[1])) # Adding vectors to track uniqueness
-            # unique_vectors.add((candidate)) # Adding vectors to track uniqueness
-            distance = np.linalg.norm(candidate[0] - query_vector)
-            distances.append(distance)
-
-        # Sort candidates based on Euclidean distance and get t unique similar vectors
-        unique_similar_vectors = []
-        for distance, candidate in sorted(zip(distances, candidates)):
-            if len(unique_similar_vectors) >= t:
-                break
-            if tuple(candidate) not in unique_similar_vectors:
-                unique_similar_vectors.append(tuple(candidate))
-
-        return list(unique_similar_vectors), len(unique_vectors), len(candidates)
+    def create_hash_tables(self):
+        for layer in range(self.num_layers):
+            for i, vector in enumerate(self.data):
+                hash_code = self.hash_vector(vector, seed=layer)
+                self.hash_tables[layer][hash_code].append(i)
+
+    def find_similar(self, external_image, t, threshold=0.9):
+        similar_images = set()
+        visited_buckets = set()
+        unique_images_considered = set()
+
+        for layer in range(self.num_layers):
+            hash_code = self.hash_vector(external_image, seed=layer)
+            visited_buckets.add(hash_code)
+
+            for key in self.hash_tables[layer]:
+                if key != hash_code and self.hamming_distance(key, hash_code) <= 2:
+                    visited_buckets.add(key)
+
+                    for idx in self.hash_tables[layer][key]:
+                        similar_images.add(idx)
+                        unique_images_considered.add(idx)
+
+        self.unique_images_considered = unique_images_considered
+        self.overall_images_considered = similar_images
+
+        similarities = [
+            (idx, self.euclidean_distance(external_image, self.data[idx])) for idx in similar_images
+        ]
+        similarities.sort(key=lambda x: x[1])
+
+        return [idx for idx, _ in similarities[:t]]
+
+    def hamming_distance(self, code1, code2):
+        return sum(c1 != c2 for c1, c2 in zip(code1, code2))
+
+    def euclidean_distance(self, vector1, vector2):
+        return np.linalg.norm(vector1 - vector2)
+
+    def get_unique_images_considered_count(self):
+        return len(self.unique_images_considered)
+
+    def get_overall_images_considered_count(self):
+        return len(self.overall_images_considered)
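For context on the tree change: this commit swaps the entropy/information-gain splitter and Node objects for Gini impurity and a nested-dict tree. For class probabilities p_i, Gini impurity is 1 - sum(p_i^2); e.g. labels [0, 0, 1, 1] give 1 - (0.5^2 + 0.5^2) = 0.5. A minimal usage sketch of the new class, assuming Phase 3/utils.py resolves as the "utils" import; the toy arrays are illustrative only:

# Toy demo of the dict-based DecisionTree added in this commit.
# Assumption: run from the Phase 3 directory so "utils" is importable.
import numpy as np
from utils import DecisionTree

data = np.array([[2.0, 1.0],
                 [3.0, 1.5],
                 [8.0, 7.0],
                 [9.0, 8.5]])
labels = np.array([0, 0, 1, 1])  # non-negative integer class ids, as np.bincount expects

clf = DecisionTree(max_depth=3)
clf.fit(data, labels)  # note: unlike the removed version, fit() no longer calls np.array() on its inputs
print(clf.predict(np.array([[2.5, 1.2], [8.5, 8.0]])))  # predicts class 0, then class 1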
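Likewise, a minimal sketch of the rewritten LSH class, again assuming the "utils" import path; random vectors stand in for the stored image descriptors. Note that, as committed, find_similar scans only buckets whose codes lie within Hamming distance 2 of the query's bucket and skips the exact-match bucket itself (key != hash_code), so it can return fewer than t indices when the tables are sparse:

# Toy demo of the rewritten LSH index; random data replaces real descriptors.
import numpy as np
from utils import LSH

rng = np.random.default_rng(0)
features = rng.standard_normal((100, 64))  # 100 "images" with 64-d descriptors

index = LSH(features, num_layers=4, num_hashes=8)  # hash tables are built in __init__
query = rng.standard_normal(64)

top_5 = index.find_similar(query, t=5)  # indices into features, nearest first
print(top_5)
print(index.get_unique_images_considered_count())  # candidates actually compared
print(index.get_overall_images_considered_count())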