mirror of https://github.com/20kaushik02/CSE515_MWDB_Project.git
synced 2025-12-06 12:04:07 +00:00
kmeans impl. (incomplete)

This commit is contained in:
parent 3e10271a41
commit 9e05228e94
1171  Phase 2/task_3.ipynb  (file diff suppressed because it is too large)
124   Phase 2/utils.py
@@ -1,6 +1,7 @@
 # All imports
 # Math
 import math
+import random
 import cv2
 import numpy as np
 from scipy.stats import pearsonr
@@ -8,7 +9,8 @@ from scipy.sparse.linalg import svds
 from sklearn.decomposition import NMF
 from sklearn.decomposition import LatentDirichletAllocation
 from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
-from sklearn.cluster import KMeans
+# from sklearn.cluster import KMeans
+
 
 # Torch
 import torch
@@ -582,6 +584,97 @@ valid_dim_reduction_methods = {
 }
 
 
+class KMeans:
+    def __init__(self, n_clusters, tol=0.001, max_iter=300, verbose=0):
+        self.n_clusters = n_clusters
+        self.max_iter = max_iter
+        self.tol = tol
+        self.cluster_centers_ = {}
+        self.verbose = verbose
+
+    def fit(self, data):
+        """Iteratively fit clusters on data of `(n_samples, n_features)` shape"""
+
+        # Randomly select centroid start points from a uniform distribution
+        # over the dataset's per-feature range
+        min_, max_ = np.min(data, axis=0), np.max(data, axis=0)
+        self.cluster_centers_ = {
+            i: np.random.uniform(min_, max_) for i in range(self.n_clusters)
+        }
+
+        if self.verbose > 0:
+            print("Initialized centroids")
+        for itr in range(self.max_iter):
+            if self.verbose > 1:
+                print(f"Iteration {itr}")
+            self.clusters = {}
+
+            for j in range(self.n_clusters):
+                self.clusters[j] = []
+
+            for feature_set in data:
+                # TODO: should this use different distance measures
+                # based on the feature set?
+                distances = [
+                    np.linalg.norm(feature_set - self.cluster_centers_[i])
+                    for i in range(len(self.cluster_centers_))
+                ]
+
+                # Put the data point into the closest cluster
+                cluster = np.argmin(distances)
+                self.clusters[cluster].append(feature_set)
+
+            # Shallow copy, so the convergence check below compares old
+            # centroids against the reassigned ones (a bare reference would
+            # alias the dict and always report zero movement)
+            prev_centroids = dict(self.cluster_centers_)
+
+            for c in self.cluster_centers_:
+                if isinstance(self.cluster_centers_[c], np.ndarray):
+                    if np.isnan(self.cluster_centers_[c]).any():
+                        # Reinitialize centroid to a random point in the dataset's range
+                        self.cluster_centers_[c] = np.random.uniform(min_, max_)
+                    else:
+                        # Recompute centroid as the mean of its (non-empty) cluster
+                        self.cluster_centers_[c] = np.mean(self.clusters[c], axis=0)
+                elif np.isnan(self.cluster_centers_[c]):
+                    # Reinitialize centroid to a random point in the dataset's range
+                    self.cluster_centers_[c] = np.random.uniform(min_, max_)
+
+            # Check if centroids have converged: the summed percent change of
+            # each centroid must fall within the tolerance (assumes the
+            # previous centroid has no zero components)
+            optimized = True
+            for c in self.cluster_centers_:
+                prev_centroid = prev_centroids[c]
+                current_centroid = self.cluster_centers_[c]
+                convergence_tol = np.sum(abs(
+                    (prev_centroid - current_centroid) / prev_centroid * 100.0
+                ))
+                if convergence_tol > self.tol:
+                    optimized = False
+                    if self.verbose > 0:
+                        print(f"Iter {itr} - Not converged yet")
+                    break
+
+            if itr > 10 and optimized:
+                if self.verbose > 0:
+                    print(f"Iter {itr} - Converged")
+                break
+
+        return self
+
+    def transform(self, data):
+        """Transform data of `(n_samples, n_features)` shape to `(n_samples, n_clusters)` centroid distances using the fitted model"""
+
+        Y = np.empty((len(data), self.n_clusters))
+
+        for idx, feature_set in enumerate(data):
+            # TODO: could this use different distance measures
+            # based on the feature set?
+            Y[idx] = np.array(
+                [
+                    np.linalg.norm(feature_set - self.cluster_centers_[i])
+                    for i in range(len(self.cluster_centers_))
+                ]
+            )
+
+        return Y
+
+
 def extract_latent_semantics(
     fd_collection, k, feature_model, dim_reduction_method, top_images=None
 ):
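For orientation, a minimal sketch of how the new class is meant to be used, following its fit/transform signatures (the toy data shape and cluster count are illustrative, not from the repo):

import numpy as np

# Hypothetical toy data: 100 samples, 5 features each
data = np.random.rand(100, 5)

model = KMeans(n_clusters=3, verbose=1).fit(data)

# transform() returns distances to each centroid, shape (100, 3),
# not hard cluster labels
Y = model.transform(data)

# A hard assignment, if needed, is the index of the nearest centroid
labels = np.argmin(Y, axis=1)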
@@ -659,7 +752,10 @@ def extract_latent_semantics(
             W = model.transform(feature_vectors_shifted)
             H = model.components_
 
-            all_latent_semantics = {"image-semantic": W, "semantic-feature": H}
+            all_latent_semantics = {
+                "image-semantic": W.tolist(),
+                "semantic-feature": H.tolist(),
+            }
 
             # for each latent semantic, sort imageID-weight pairs by weights in descending order
             displayed_latent_semantics = [
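The switch from raw ndarrays to .tolist() presumably makes all_latent_semantics serializable for storage, since nested plain lists, unlike ndarrays, are JSON/BSON-friendly. A quick illustration of the difference:

import json
import numpy as np

W = np.eye(2)
# json.dumps({"image-semantic": W}) raises TypeError: ndarray is not JSON serializable
print(json.dumps({"image-semantic": W.tolist()}))  # OK: nested plain lists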
@@ -689,7 +785,10 @@ def extract_latent_semantics(
             # X (4339 x k) is the other factor matrix for image ID-latent semantic pairs
             X = model.transform(feature_vectors_shifted)
 
-            all_latent_semantics = {"image-semantic": X, "semantic-feature": K}
+            all_latent_semantics = {
+                "image-semantic": X.tolist(),
+                "semantic-feature": K.tolist(),
+            }
 
             # for each latent semantic, sort imageID-weight pairs by weights in descending order
             displayed_latent_semantics = [
@@ -703,11 +802,24 @@ def extract_latent_semantics(
 
         # k-means clustering to reduce to k clusters/dimensions
         case 4:
-            model = KMeans(n_clusters=k).fit(feature_vectors)
+            model = KMeans(n_clusters=k, verbose=2).fit(feature_vectors)
             CC = model.cluster_centers_
-            U = model.transform(feature_vectors)
+            Y = model.transform(feature_vectors)
 
-            all_latent_semantics = {"image-semantic": U, "semantic_feature": CC}
+            all_latent_semantics = {
+                "image-semantic": Y.tolist(),
+                "semantic-feature": list(CC.values()),
+            }
+
+            # for each latent semantic, sort imageID-distance pairs by distance
+            # in ascending order (transform() yields distances, so smaller is closer)
+            displayed_latent_semantics = [
+                sorted(
+                    list(zip(feature_ids, latent_semantic)),
+                    key=lambda x: x[1],
+                    reverse=False,
+                )[:top_images]
+                for latent_semantic in Y.T
+            ]
 
     for idx, latent_semantic in enumerate(displayed_latent_semantics):
         print(f"Latent semantic no. {idx}")
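A note on the commented-out sklearn import: sklearn.cluster.KMeans exposes the same fit/transform contract, and its transform likewise returns distances to the cluster centers, so most of case 4 reads as a near drop-in swap. One difference to mind is that sklearn's cluster_centers_ is an ndarray, while the new class stores a dict, which is why the code above needs list(CC.values()). A sketch, assuming feature_vectors and k from the surrounding function:

from sklearn.cluster import KMeans

model = KMeans(n_clusters=k).fit(feature_vectors)
Y = model.transform(feature_vectors)   # (n_samples, k) distances to centers
CC = model.cluster_centers_            # ndarray of shape (k, n_features)
semantic_feature = CC.tolist()         # no .values() needed here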