mirror of https://github.com/20kaushik02/CSE515_MWDB_Project.git
synced 2025-12-06 12:04:07 +00:00
kmeans impl. (incomplete)

This commit is contained in:
parent 3e10271a41
commit 9e05228e94
1171  Phase 2/task_3.ipynb  (file diff suppressed because it is too large)
124   Phase 2/utils.py
@@ -1,6 +1,7 @@
 # All imports
 # Math
 import math
+import random
 import cv2
 import numpy as np
 from scipy.stats import pearsonr
@@ -8,7 +9,8 @@ from scipy.sparse.linalg import svds
 from sklearn.decomposition import NMF
 from sklearn.decomposition import LatentDirichletAllocation
 from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
-from sklearn.cluster import KMeans
+# from sklearn.cluster import KMeans
+
 
 # Torch
 import torch
@@ -582,6 +584,97 @@ valid_dim_reduction_methods = {
 }
 
 
+class KMeans:
+    def __init__(self, n_clusters, tol=0.001, max_iter=300, verbose=0):
+        self.n_clusters = n_clusters
+        self.max_iter = max_iter
+        self.tol = tol
+        self.cluster_centers_ = {}
+        self.verbose = verbose
+
+    def fit(self, data):
+        """Iteratively fit clusters on data of `(n_samples, n_features)` shape"""
+
+        # Randomly select centroid start points from a uniform distribution
+        # over the dataset's per-feature range
+        min_, max_ = np.min(data, axis=0), np.max(data, axis=0)
+        self.cluster_centers_ = {
+            i: np.random.uniform(min_, max_) for i in range(self.n_clusters)
+        }
+
+        if self.verbose > 0:
+            print("Initialized centroids")
+        for itr in range(self.max_iter):
+            if self.verbose > 1:
+                print(f"Iteration {itr}")
+            self.clusters = {}
+
+            for j in range(self.n_clusters):
+                self.clusters[j] = []
+
+            for feature_set in data:
+                # TODO: should this use different distance measures
+                # based on the feature set?
+                distances = [
+                    np.linalg.norm(feature_set - self.cluster_centers_[i])
+                    for i in range(len(self.cluster_centers_))
+                ]
+
+                # Put the data point into the closest cluster
+                cluster = np.argmin(distances)
+                self.clusters[cluster].append(feature_set)
+
+            # Shallow copy, so the convergence check below compares old
+            # centroids against the reassigned ones (a bare reference would
+            # alias the dict and always report zero movement)
+            prev_centroids = dict(self.cluster_centers_)
+
+            for c in self.cluster_centers_:
+                if isinstance(self.cluster_centers_[c], np.ndarray):
+                    if np.isnan(self.cluster_centers_[c]).any():
+                        # Reinitialize centroid to a random point in the dataset's range
+                        self.cluster_centers_[c] = np.random.uniform(min_, max_)
+                    else:
+                        # Recompute centroid as the mean of its (non-empty) cluster
+                        self.cluster_centers_[c] = np.mean(self.clusters[c], axis=0)
+                elif np.isnan(self.cluster_centers_[c]):
+                    # Reinitialize centroid to a random point in the dataset's range
+                    self.cluster_centers_[c] = np.random.uniform(min_, max_)
+
+            # Check if centroids have converged: the summed percent change of
+            # each centroid must fall within the tolerance (assumes the
+            # previous centroid has no zero components)
+            optimized = True
+            for c in self.cluster_centers_:
+                prev_centroid = prev_centroids[c]
+                current_centroid = self.cluster_centers_[c]
+                convergence_tol = np.sum(abs(
+                    (prev_centroid - current_centroid) / prev_centroid * 100.0
+                ))
+                if convergence_tol > self.tol:
+                    optimized = False
+                    if self.verbose > 0:
+                        print(f"Iter {itr} - Not converged yet")
+                    break
+
+            if itr > 10 and optimized:
+                if self.verbose > 0:
+                    print(f"Iter {itr} - Converged")
+                break
+
+        return self
+
+    def transform(self, data):
+        """Transform data of `(n_samples, n_features)` shape to `(n_samples, n_clusters)` centroid distances using the fitted model"""
+
+        Y = np.empty((len(data), self.n_clusters))
+
+        for idx, feature_set in enumerate(data):
+            # TODO: could this use different distance measures
+            # based on the feature set?
+            Y[idx] = np.array(
+                [
+                    np.linalg.norm(feature_set - self.cluster_centers_[i])
+                    for i in range(len(self.cluster_centers_))
+                ]
+            )
+
+        return Y
+
+
 def extract_latent_semantics(
     fd_collection, k, feature_model, dim_reduction_method, top_images=None
 ):
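For orientation, a minimal sketch of how the new class is meant to be used, following its fit/transform signatures (the toy data shape and cluster count are illustrative, not from the repo):

import numpy as np

# Hypothetical toy data: 100 samples, 5 features each
data = np.random.rand(100, 5)

model = KMeans(n_clusters=3, verbose=1).fit(data)

# transform() returns distances to each centroid, shape (100, 3),
# not hard cluster labels
Y = model.transform(data)

# A hard assignment, if needed, is the index of the nearest centroid
labels = np.argmin(Y, axis=1)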
@@ -659,7 +752,10 @@ def extract_latent_semantics(
             W = model.transform(feature_vectors_shifted)
             H = model.components_
 
-            all_latent_semantics = {"image-semantic": W, "semantic-feature": H}
+            all_latent_semantics = {
+                "image-semantic": W.tolist(),
+                "semantic-feature": H.tolist(),
+            }
 
             # for each latent semantic, sort imageID-weight pairs by weights in descending order
             displayed_latent_semantics = [
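The switch from raw ndarrays to .tolist() presumably makes all_latent_semantics serializable for storage, since nested plain lists, unlike ndarrays, are JSON/BSON-friendly. A quick illustration of the difference:

import json
import numpy as np

W = np.eye(2)
# json.dumps({"image-semantic": W}) raises TypeError: ndarray is not JSON serializable
print(json.dumps({"image-semantic": W.tolist()}))  # OK: nested plain lists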
@@ -689,7 +785,10 @@ def extract_latent_semantics(
             # X (4339 x k) is the other factor matrix for image ID-latent semantic pairs
             X = model.transform(feature_vectors_shifted)
 
-            all_latent_semantics = {"image-semantic": X, "semantic-feature": K}
+            all_latent_semantics = {
+                "image-semantic": X.tolist(),
+                "semantic-feature": K.tolist(),
+            }
 
             # for each latent semantic, sort imageID-weight pairs by weights in descending order
             displayed_latent_semantics = [
@@ -703,11 +802,24 @@ def extract_latent_semantics(
 
         # k-means clustering to reduce to k clusters/dimensions
         case 4:
-            model = KMeans(n_clusters=k).fit(feature_vectors)
+            model = KMeans(n_clusters=k, verbose=2).fit(feature_vectors)
             CC = model.cluster_centers_
-            U = model.transform(feature_vectors)
+            Y = model.transform(feature_vectors)
 
-            all_latent_semantics = {"image-semantic": U, "semantic_feature": CC}
+            all_latent_semantics = {
+                "image-semantic": Y.tolist(),
+                "semantic-feature": list(CC.values()),
+            }
+
+            # for each latent semantic, sort imageID-distance pairs by distance
+            # in ascending order (transform() yields distances, so smaller is closer)
+            displayed_latent_semantics = [
+                sorted(
+                    list(zip(feature_ids, latent_semantic)),
+                    key=lambda x: x[1],
+                    reverse=False,
+                )[:top_images]
+                for latent_semantic in Y.T
+            ]
 
     for idx, latent_semantic in enumerate(displayed_latent_semantics):
         print(f"Latent semantic no. {idx}")
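A note on the commented-out sklearn import: sklearn.cluster.KMeans exposes the same fit/transform contract, and its transform likewise returns distances to the cluster centers, so most of case 4 reads as a near drop-in swap. One difference to mind is that sklearn's cluster_centers_ is an ndarray, while the new class stores a dict, which is why the code above needs list(CC.values()). A sketch, assuming feature_vectors and k from the surrounding function:

from sklearn.cluster import KMeans

model = KMeans(n_clusters=k).fit(feature_vectors)
Y = model.transform(feature_vectors)   # (n_samples, k) distances to centers
CC = model.cluster_centers_            # ndarray of shape (k, n_features)
semantic_feature = CC.tolist()         # no .values() needed here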