In [1]:
from utils import *
warnings.filterwarnings('ignore')
%matplotlib inline
from sklearn.decomposition import PCA
from statistics import mode
from sklearn.metrics import precision_recall_fscore_support

In [2]:
# Handle to the stored image records; a later cell reads every record from it via `.find()`.
# NOTE(review): `getCollection` comes from the `utils` star-import; presumably the two
# arguments are the database name and the collection name — confirm against utils.
fd_collection = getCollection("team_5_mwdb_phase_2", "fd_collection")

In [3]:
# Key of the stored feature vector used to represent each image record.
selected_feature_model = "fc_fd"

# m is the number of nearest neighbours that vote when the neighbour-based
# classifier is selected, so it must be at least 1.
m = int(input("Enter value of m: "))
if m < 1:
    raise ValueError("m should be a positive integer")

# input() already returns a str; strip stray whitespace so a trailing space
# does not turn a valid choice into an unknown one.
classification_method = input(
    "Enter classification method - one of "
    + str(list(valid_classification_methods.keys()))
).strip()

# Validate here so a typo fails with a readable message instead of a bare
# KeyError when the method is looked up in a later cell.
if classification_method not in valid_classification_methods:
    raise ValueError(
        "classification method should be one of "
        + str(list(valid_classification_methods.keys()))
    )

In [4]:
# Load every image record and order it by image_id so row positions are reproducible.
all_images = sorted(fd_collection.find(), key=lambda img: img["image_id"])

# The parity of image_id drives the whole split: even ids form the labelled
# reference set, odd ids are the images to classify. Labels AND feature rows
# are selected with the same mask so they can never drift out of alignment,
# even when ids are not contiguous or do not start at an even number.
image_ids = np.array([img["image_id"] for img in all_images])
is_even = image_ids % 2 == 0

odd_image_ids = [img["image_id"] for img in all_images if img["image_id"] % 2 != 0]

even_image_labels = [img["true_label"] for img in all_images if img["image_id"] % 2 == 0]
odd_image_labels = [img["true_label"] for img in all_images if img["image_id"] % 2 != 0]

# One flattened feature vector per record, in the same order as all_images.
feature_vectors = np.array(
    [np.array(img[selected_feature_model]).flatten() for img in all_images]
)

# Reduce to at most 150 components; PCA rejects n_components larger than
# min(n_samples, n_features), so clamp for small collections.
# NOTE(review): the projection is fitted on even and odd images together, so
# the odd (evaluation) images influence the learned components — confirm this
# is intended.
pca = PCA(n_components=min(150, *feature_vectors.shape))
feature_vectors = pca.fit_transform(feature_vectors)

total_len = len(feature_vectors)

even_feature_vectors = feature_vectors[is_even]
odd_feature_vectors = feature_vectors[~is_even]

odd_len = odd_feature_vectors.shape[0]
even_len = even_feature_vectors.shape[0]

In [5]:
class Node:
    """One node of a binary decision tree.

    An internal node stores the ``feature`` index and ``threshold`` of its
    split together with its ``left`` (feature <= threshold) and ``right``
    (feature > threshold) children. A leaf stores only ``value``, the label
    it predicts; a node is treated as a leaf whenever ``value`` is not None.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value


class DecisionTree:
    """Binary decision-tree classifier that splits on information gain.

    Args:
        max_depth: Maximum depth of the tree. ``None`` grows the tree until
            every leaf is pure or no split yields positive gain.
    """

    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = None

    def entropy(self, y):
        """Return the Shannon entropy (base 2) of the labels in ``y``.

        An empty ``y`` has entropy 0.
        """
        if len(y) == 0:
            return 0.0
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        return -np.sum(probabilities * np.log2(probabilities))

    def information_gain(self, X, y, feature, threshold, parent_entropy=None):
        """Return the entropy reduction of splitting on ``X[:, feature] <= threshold``.

        Args:
            X: 2-D array of samples.
            y: Labels aligned with the rows of ``X``.
            feature: Column index to split on.
            threshold: Rows with a value <= threshold go to the left side.
            parent_entropy: Optional precomputed ``entropy(y)``; it is computed
                here when omitted, so existing four-argument calls still work.
        """
        if parent_entropy is None:
            parent_entropy = self.entropy(y)

        left_idxs = X[:, feature] <= threshold
        left_y = y[left_idxs]
        right_y = y[~left_idxs]

        p_left = len(left_y) / len(y)
        p_right = len(right_y) / len(y)

        return parent_entropy - (p_left * self.entropy(left_y) + p_right * self.entropy(right_y))

    def find_best_split(self, X, y):
        """Return ``(feature, threshold)`` with the highest positive gain.

        Returns ``(None, None)`` when no candidate split improves on the
        unsplit node.
        """
        best_gain = 0
        best_feature = None
        best_threshold = None

        # The parent entropy is the same for every candidate, so compute it once.
        parent_entropy = self.entropy(y)

        for feature in range(X.shape[1]):
            # The largest value would send every row left (empty right side,
            # zero gain), so it can never be selected and is skipped.
            for threshold in np.unique(X[:, feature])[:-1]:
                gain = self.information_gain(X, y, feature, threshold, parent_entropy)
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_threshold = threshold

        return best_feature, best_threshold

    def _majority_label(self, y):
        """Return the most frequent label in ``y`` (smallest label wins ties)."""
        # np.unique works for any sortable label type, unlike np.bincount which
        # only accepts non-negative integers; for those it gives the same answer.
        labels, counts = np.unique(y, return_counts=True)
        return labels[np.argmax(counts)]

    def build_tree(self, X, y, depth=0):
        """Recursively build and return the subtree for samples ``X``/``y``."""
        # Stop on a pure node or when the depth budget is used up.
        if len(np.unique(y)) == 1 or depth == self.max_depth:
            return Node(value=self._majority_label(y))

        best_feature, best_threshold = self.find_best_split(X, y)

        # No split separates the labels any further: emit a majority leaf.
        if best_feature is None:
            return Node(value=self._majority_label(y))

        left_idxs = X[:, best_feature] <= best_threshold
        right_idxs = ~left_idxs

        left_subtree = self.build_tree(X[left_idxs], y[left_idxs], depth + 1)
        right_subtree = self.build_tree(X[right_idxs], y[right_idxs], depth + 1)

        return Node(feature=best_feature, threshold=best_threshold, left=left_subtree, right=right_subtree)

    def fit(self, X, y):
        """Build the tree from samples ``X`` (2-D) and their labels ``y``.

        Raises:
            ValueError: If ``X`` is not 2-D, is empty, or its row count does
                not match the number of labels.
        """
        X = np.array(X)
        y = np.array(y)

        if X.ndim != 2:
            raise ValueError("X should be a 2-D array of shape (n_samples, n_features)")
        if len(X) == 0:
            raise ValueError("cannot fit a decision tree on an empty dataset")
        if len(X) != len(y):
            raise ValueError("X and y should contain the same number of samples")

        self.tree = self.build_tree(X, y)

    def predict_instance(self, x, node):
        """Return the label of the leaf that sample ``x`` reaches from ``node``."""
        # Walk down iteratively so deep trees cannot hit the recursion limit.
        while node.value is None:
            node = node.left if x[node.feature] <= node.threshold else node.right
        return node.value

    def predict(self, X):
        """Return an array with one predicted label per row of ``X``.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self.tree is None:
            raise RuntimeError("DecisionTree.predict called before fit")

        X = np.array(X)
        return np.array([self.predict_instance(x, self.tree) for x in X])

In [7]:
match valid_classification_methods[classification_method]:

    case 1:

        predictions = []

        for i in range(odd_len):

            distances = []

            for j in range(even_len):
                distances.append({"label": even_image_labels[j], "distance": euclidean_distance_measure(odd_feature_vectors[i], even_feature_vectors[j])})
            
            sorted_distances = sorted(distances, key=lambda x: x['distance'])[:m]

            labels = [sd["label"] for sd in sorted_distances]
            
            pred = max(set(labels), key = labels.count)

            predictions.append(pred)

            print(f"Image ID: {i * 2 + 1} is similar to {pred}")


    case 2:

        tree = DecisionTree(max_depth=7)
        tree.fit(even_feature_vectors, even_image_labels)

        print("Tree formed")

        predictions = tree.predict(odd_feature_vectors)

        pred_len = len(predictions)

        for i in range(pred_len):
            print(f"Image ID: {i * 2 + 1} is similar to {predictions[i]}")

Image ID: 1 is similar to 0
Image ID: 3 is similar to 0
Image ID: 5 is similar to 0
Image ID: 7 is similar to 0
Image ID: 9 is similar to 0
Image ID: 11 is similar to 1
Image ID: 13 is similar to 0
Image ID: 15 is similar to 0
Image ID: 17 is similar to 0
Image ID: 19 is similar to 0
Image ID: 21 is similar to 0
Image ID: 23 is similar to 1
Image ID: 25 is similar to 0
Image ID: 27 is similar to 0
Image ID: 29 is similar to 0
Image ID: 31 is similar to 0
Image ID: 33 is similar to 1
Image ID: 35 is similar to 0
Image ID: 37 is similar to 0
Image ID: 39 is similar to 0
Image ID: 41 is similar to 0
Image ID: 43 is similar to 0
Image ID: 45 is similar to 0
Image ID: 47 is similar to 0
Image ID: 49 is similar to 0
Image ID: 51 is similar to 0
Image ID: 53 is similar to 0
Image ID: 55 is similar to 0
Image ID: 57 is similar to 1
Image ID: 59 is similar to 0
Image ID: 61 is similar to 1
Image ID: 63 is similar to 0
Image ID: 65 is similar to 1
Image ID: 67 is similar to 0
Image ID: 69 is sim