save decision tree

pranavbrkr 2023-11-26 16:31:27 -07:00
parent 1b1dc90236
commit a10336e5af
5 changed files with 2579 additions and 2359 deletions

Binary file not shown.

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

Phase 3/task_5.ipynb (new file, 247 additions)

File diff suppressed because one or more lines are too long


@@ -6,6 +6,8 @@ import cv2
import numpy as np
from scipy.stats import pearsonr
from collections import defaultdict
# from scipy.sparse.linalg import svds
# from sklearn.decomposition import NMF
from sklearn.decomposition import LatentDirichletAllocation
@@ -20,6 +22,8 @@ from torchvision.models import resnet50, ResNet50_Weights
import tensorly as tl
import heapq
# OS and env
import json
import os
@@ -66,3 +70,155 @@ valid_feature_models = {
    "fc": "fc_fd",
    "resnet": "resnet_fd",
}
class Node:
    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        self.feature = feature      # index of the feature this node splits on
        self.threshold = threshold  # split threshold for that feature
        self.left = left            # subtree for samples with feature <= threshold
        self.right = right          # subtree for samples with feature > threshold
        self.value = value          # predicted class label (set on leaf nodes only)


class DecisionTree:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = None

    def entropy(self, y):
        # Shannon entropy of the label distribution: -sum(p * log2(p))
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        return -np.sum(probabilities * np.log2(probabilities))

    def information_gain(self, X, y, feature, threshold):
        # Entropy reduction from splitting on (feature, threshold)
        left_idxs = X[:, feature] <= threshold
        right_idxs = ~left_idxs
        left_y = y[left_idxs]
        right_y = y[right_idxs]
        p_left = len(left_y) / len(y)
        p_right = len(right_y) / len(y)
        gain = self.entropy(y) - (p_left * self.entropy(left_y) + p_right * self.entropy(right_y))
        return gain

    def find_best_split(self, X, y):
        # Exhaustive search over every feature and every observed value as a threshold
        best_gain = 0
        best_feature = None
        best_threshold = None
        for feature in range(X.shape[1]):
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                gain = self.information_gain(X, y, feature, threshold)
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_threshold = threshold
        return best_feature, best_threshold

    def build_tree(self, X, y, depth=0):
        # Stop when the node is pure or the depth limit is reached
        if len(np.unique(y)) == 1 or depth == self.max_depth:
            return Node(value=np.argmax(np.bincount(y)))
        best_feature, best_threshold = self.find_best_split(X, y)
        if best_feature is None:  # no split improves entropy; make a leaf
            return Node(value=np.argmax(np.bincount(y)))
        left_idxs = X[:, best_feature] <= best_threshold
        right_idxs = ~left_idxs
        left_subtree = self.build_tree(X[left_idxs], y[left_idxs], depth + 1)
        right_subtree = self.build_tree(X[right_idxs], y[right_idxs], depth + 1)
        return Node(feature=best_feature, threshold=best_threshold, left=left_subtree, right=right_subtree)

    def fit(self, X, y):
        X = np.array(X)  # convert to NumPy array
        y = np.array(y)  # labels must be non-negative integers for np.bincount
        self.tree = self.build_tree(X, y)

    def predict_instance(self, x, node):
        # Walk the tree until a leaf is reached
        if node.value is not None:
            return node.value
        if x[node.feature] <= node.threshold:
            return self.predict_instance(x, node.left)
        else:
            return self.predict_instance(x, node.right)

    def predict(self, X):
        X = np.array(X)  # convert to NumPy array
        predictions = []
        for x in X:
            pred = self.predict_instance(x, self.tree)
            predictions.append(pred)
        return np.array(predictions)
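
A minimal, hypothetical smoke test for the DecisionTree class above; it is not part of the commit, and the toy arrays and names (X_train, y_train, toy_tree) are made up for illustration:

# Hypothetical usage sketch (not in the commit): fit on four labeled points
# and classify a new one. Labels are non-negative ints, as np.bincount requires.
import numpy as np

X_train = np.array([[2.0, 3.0], [1.0, 1.0], [4.0, 5.0], [6.0, 2.0]])
y_train = np.array([0, 0, 1, 1])

toy_tree = DecisionTree(max_depth=3)
toy_tree.fit(X_train, y_train)
print(toy_tree.predict(np.array([[5.0, 4.0]])))  # -> [1]

With this data the best split is feature 0 at threshold 2.0, which separates the classes perfectly (gain 1.0 bit), so the tree is a single split with two pure leaves.
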
class LSHIndex:
    def __init__(self, num_layers, num_hashes, dimensions, seed=42):
        self.num_layers = num_layers  # number of independent hash tables
        self.num_hashes = num_hashes  # hyperplanes per table (bits per bucket key)
        self.dimensions = dimensions
        self.index = [defaultdict(list) for _ in range(num_layers)]
        self.hash_functions = self._generate_hash_functions(seed)

    def _generate_hash_functions(self, seed):
        # One set of random unit-norm hyperplanes per layer
        np.random.seed(seed)
        hash_functions = []
        for _ in range(self.num_layers):
            layer_hashes = []
            for _ in range(self.num_hashes):
                random_projection = np.random.randn(self.dimensions)
                random_projection /= np.linalg.norm(random_projection)
                layer_hashes.append(random_projection)
            hash_functions.append(layer_hashes)
        return hash_functions

    def hash_vector(self, vector):
        # Bucket key per layer: the sign pattern of the vector against each hyperplane
        hashed_values = []
        for i in range(self.num_layers):
            layer_hashes = self.hash_functions[i]
            layer_hash = [int(np.dot(vector, h) > 0) for h in layer_hashes]
            hashed_values.append(tuple(layer_hash))
        return hashed_values

    def add_vector(self, vector, image_id):
        hashed = self.hash_vector(vector)
        for i in range(self.num_layers):
            self.index[i][hashed[i]].append((image_id, vector))

    def query(self, query_vector):
        # Collect every candidate that shares a bucket with the query in any layer,
        # deduplicated by image id (the vectors themselves are unhashable arrays)
        hashed_query = self.hash_vector(query_vector)
        candidates = {}
        for i in range(self.num_layers):
            for image_id, vector in self.index[i][hashed_query[i]]:
                candidates[image_id] = vector
        return list(candidates.items())

    def query_t_unique(self, query_vector, t):
        hashed_query = self.hash_vector(query_vector)
        candidates = []
        unique_vectors = set()  # track unique vectors considered
        for i in range(self.num_layers):
            candidates.extend(self.index[i][hashed_query[i]])
        # Euclidean distance between the query and each candidate vector
        distances = []
        for candidate in candidates:
            unique_vectors.add(tuple(candidate[1]))  # add vectors to track uniqueness
            distance = np.linalg.norm(candidate[1] - query_vector)  # candidate[1] is the vector
            distances.append(distance)
        # Sort candidates by distance and keep the first t with distinct image ids
        unique_similar_vectors = []
        seen_ids = set()
        for distance, candidate in sorted(zip(distances, candidates), key=lambda pair: pair[0]):
            if len(unique_similar_vectors) >= t:
                break
            if candidate[0] not in seen_ids:
                seen_ids.add(candidate[0])
                unique_similar_vectors.append(candidate)
        return unique_similar_vectors, len(unique_vectors), len(candidates)
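
And a minimal, hypothetical sketch of the LSHIndex in use; the dimensionality, layer counts, and random vectors below are illustrative only, not values from the notebook:

# Hypothetical usage sketch (not in the commit): index 100 random 16-d vectors,
# then fetch up to 5 nearest unique candidates for a random query.
import numpy as np

rng = np.random.default_rng(0)
lsh = LSHIndex(num_layers=4, num_hashes=4, dimensions=16)
for image_id in range(100):
    lsh.add_vector(rng.standard_normal(16), image_id)

top, n_unique, n_considered = lsh.query_t_unique(rng.standard_normal(16), t=5)
print(len(top), n_unique, n_considered)

Fewer hash bits per layer (num_hashes) means larger buckets and more candidates per query; more layers raises the chance that a true neighbor collides with the query in at least one table.
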