fixed kmeans

This commit is contained in:
Kaushik Narayan R 2023-10-12 18:35:50 -07:00
parent 906ca102ae
commit aa8184fcfc
2 changed files with 145 additions and 163 deletions

View File

@ -29,117 +29,120 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Applying svd on the cm_fd space to get 10 latent semantics (showing only top 10 image-weight pairs for each latent semantic)...\n", "Applying kmeans on the resnet_fd space to get 10 latent semantics (showing only top 10 image-weight pairs for each latent semantic)...\n",
"Initialized centroids\n",
"Iteration 56 - Converged\n",
"Note: for K-Means we display distances, in ascending order\n",
"Latent semantic no. 0\n", "Latent semantic no. 0\n",
"Image_ID\t7654\t-\tWeight\t0.0816218927496473\n", "Image_ID\t440\t-\tDistance\t10.640763416796371\n",
"Image_ID\t8634\t-\tWeight\t0.0667358948577843\n", "Image_ID\t700\t-\tDistance\t11.159224514655602\n",
"Image_ID\t5740\t-\tWeight\t0.06005882120197204\n", "Image_ID\t654\t-\tDistance\t11.395135539610168\n",
"Image_ID\t6106\t-\tWeight\t0.0530666139393161\n", "Image_ID\t486\t-\tDistance\t11.550858382118225\n",
"Image_ID\t5456\t-\tWeight\t0.051701715703308504\n", "Image_ID\t462\t-\tDistance\t11.61044182679253\n",
"Image_ID\t7814\t-\tWeight\t0.04997978865116192\n", "Image_ID\t652\t-\tDistance\t11.818427599783789\n",
"Image_ID\t6248\t-\tWeight\t0.04946683639815059\n", "Image_ID\t676\t-\tDistance\t11.925768133017636\n",
"Image_ID\t5354\t-\tWeight\t0.04864381025793159\n", "Image_ID\t584\t-\tDistance\t11.93319861884516\n",
"Image_ID\t6108\t-\tWeight\t0.0479676393433854\n", "Image_ID\t692\t-\tDistance\t11.979693069110743\n",
"Image_ID\t5438\t-\tWeight\t0.04787474760068962\n", "Image_ID\t6\t-\tDistance\t12.137562566975056\n",
"Latent semantic no. 1\n", "Latent semantic no. 1\n",
"Image_ID\t7654\t-\tWeight\t0.05566187740909836\n", "Image_ID\t3602\t-\tDistance\t13.563162479981145\n",
"Image_ID\t7880\t-\tWeight\t0.05304265128270742\n", "Image_ID\t2414\t-\tDistance\t14.192224338224467\n",
"Image_ID\t5132\t-\tWeight\t0.052802620405367526\n", "Image_ID\t3560\t-\tDistance\t14.205420291205272\n",
"Image_ID\t4516\t-\tWeight\t0.05032667794065215\n", "Image_ID\t3600\t-\tDistance\t14.389262503144405\n",
"Image_ID\t3064\t-\tWeight\t0.04996389545581616\n", "Image_ID\t2228\t-\tDistance\t14.4828087393621\n",
"Image_ID\t7808\t-\tWeight\t0.04885211523705829\n", "Image_ID\t3636\t-\tDistance\t14.497503774497243\n",
"Image_ID\t8102\t-\tWeight\t0.04821048869059779\n", "Image_ID\t3614\t-\tDistance\t14.591251785931954\n",
"Image_ID\t5336\t-\tWeight\t0.047392911537133244\n", "Image_ID\t2090\t-\tDistance\t14.620114150279178\n",
"Image_ID\t3058\t-\tWeight\t0.04622961181395915\n", "Image_ID\t2328\t-\tDistance\t14.69159730598465\n",
"Image_ID\t7484\t-\tWeight\t0.04563242634411927\n", "Image_ID\t2448\t-\tDistance\t14.774950728597261\n",
"Latent semantic no. 2\n", "Latent semantic no. 2\n",
"Image_ID\t7654\t-\tWeight\t0.07046701663277787\n", "Image_ID\t4838\t-\tDistance\t12.261260721990451\n",
"Image_ID\t2804\t-\tWeight\t0.059682344110995336\n", "Image_ID\t7302\t-\tDistance\t12.880136852617754\n",
"Image_ID\t2710\t-\tWeight\t0.05919911159809061\n", "Image_ID\t7978\t-\tDistance\t13.077993711608961\n",
"Image_ID\t3436\t-\tWeight\t0.05368202357324448\n", "Image_ID\t8600\t-\tDistance\t13.305290839761437\n",
"Image_ID\t7936\t-\tWeight\t0.05327699149689366\n", "Image_ID\t7292\t-\tDistance\t13.334716062864114\n",
"Image_ID\t2708\t-\tWeight\t0.04852701979500758\n", "Image_ID\t7720\t-\tDistance\t13.37155798887382\n",
"Image_ID\t3764\t-\tWeight\t0.04835537239641772\n", "Image_ID\t7958\t-\tDistance\t13.430323190148206\n",
"Image_ID\t7928\t-\tWeight\t0.04799898902425922\n", "Image_ID\t4600\t-\tDistance\t13.45781162474979\n",
"Image_ID\t5684\t-\tWeight\t0.04723047448150721\n", "Image_ID\t4270\t-\tDistance\t13.491427681265899\n",
"Image_ID\t5126\t-\tWeight\t0.04720498270016626\n", "Image_ID\t4828\t-\tDistance\t13.539053205319615\n",
"Latent semantic no. 3\n", "Latent semantic no. 3\n",
"Image_ID\t8654\t-\tWeight\t0.08668332932816088\n", "Image_ID\t1758\t-\tDistance\t5.030040634300718\n",
"Image_ID\t8618\t-\tWeight\t0.08568859853566119\n", "Image_ID\t1562\t-\tDistance\t5.3329050871004755\n",
"Image_ID\t8658\t-\tWeight\t0.0777605087520117\n", "Image_ID\t1586\t-\tDistance\t5.583507266395663\n",
"Image_ID\t3306\t-\tWeight\t0.0745220591779124\n", "Image_ID\t1362\t-\tDistance\t6.017196001905923\n",
"Image_ID\t8620\t-\tWeight\t0.07351843281590886\n", "Image_ID\t1626\t-\tDistance\t6.045998053427588\n",
"Image_ID\t8638\t-\tWeight\t0.06948884666766826\n", "Image_ID\t1208\t-\tDistance\t6.051540458349612\n",
"Image_ID\t6754\t-\tWeight\t0.06896434951935482\n", "Image_ID\t1374\t-\tDistance\t6.178242313742901\n",
"Image_ID\t8676\t-\tWeight\t0.06623938393792103\n", "Image_ID\t1112\t-\tDistance\t6.249956790411116\n",
"Image_ID\t4650\t-\tWeight\t0.06566930583744507\n", "Image_ID\t1710\t-\tDistance\t6.310688634541122\n",
"Image_ID\t8636\t-\tWeight\t0.06499098805246775\n", "Image_ID\t1490\t-\tDistance\t6.376123320547912\n",
"Latent semantic no. 4\n", "Latent semantic no. 4\n",
"Image_ID\t7370\t-\tWeight\t0.05281026462494081\n", "Image_ID\t8282\t-\tDistance\t10.506907762007522\n",
"Image_ID\t6528\t-\tWeight\t0.05252803707219361\n", "Image_ID\t8348\t-\tDistance\t10.647963471647738\n",
"Image_ID\t8056\t-\tWeight\t0.0517501956788071\n", "Image_ID\t8380\t-\tDistance\t10.715093501411761\n",
"Image_ID\t2958\t-\tWeight\t0.051231189117377514\n", "Image_ID\t8228\t-\tDistance\t10.879515968086416\n",
"Image_ID\t4614\t-\tWeight\t0.05061302210733084\n", "Image_ID\t8240\t-\tDistance\t10.896279105885796\n",
"Image_ID\t8292\t-\tWeight\t0.05000577057549516\n", "Image_ID\t8340\t-\tDistance\t10.952943877775777\n",
"Image_ID\t7888\t-\tWeight\t0.04905059301012733\n", "Image_ID\t8174\t-\tDistance\t11.012538653878869\n",
"Image_ID\t6540\t-\tWeight\t0.048139958875035006\n", "Image_ID\t8368\t-\tDistance\t11.01584931675634\n",
"Image_ID\t6064\t-\tWeight\t0.04605896293857509\n", "Image_ID\t8176\t-\tDistance\t11.074708303511043\n",
"Image_ID\t2974\t-\tWeight\t0.04488429099909442\n", "Image_ID\t8386\t-\tDistance\t11.090905861600216\n",
"Latent semantic no. 5\n", "Latent semantic no. 5\n",
"Image_ID\t8570\t-\tWeight\t0.08379938013632153\n", "Image_ID\t7400\t-\tDistance\t9.07340282234228\n",
"Image_ID\t7784\t-\tWeight\t0.07238472588049127\n", "Image_ID\t7332\t-\tDistance\t9.27997555888011\n",
"Image_ID\t4152\t-\tWeight\t0.06076922471976642\n", "Image_ID\t6626\t-\tDistance\t9.490015364667478\n",
"Image_ID\t5114\t-\tWeight\t0.05387212151769057\n", "Image_ID\t7990\t-\tDistance\t9.619812101313876\n",
"Image_ID\t7774\t-\tWeight\t0.05324887247524\n", "Image_ID\t7392\t-\tDistance\t9.640980435311661\n",
"Image_ID\t8614\t-\tWeight\t0.05319742868629018\n", "Image_ID\t7404\t-\tDistance\t9.6738734363643\n",
"Image_ID\t3072\t-\tWeight\t0.05083994521792827\n", "Image_ID\t7980\t-\tDistance\t9.710518881249477\n",
"Image_ID\t7798\t-\tWeight\t0.050598074135949\n", "Image_ID\t7410\t-\tDistance\t9.778693486707565\n",
"Image_ID\t5118\t-\tWeight\t0.05022770477320978\n", "Image_ID\t7950\t-\tDistance\t9.785247539262517\n",
"Image_ID\t7040\t-\tWeight\t0.04996996742218058\n", "Image_ID\t7346\t-\tDistance\t9.806294880503\n",
"Latent semantic no. 6\n", "Latent semantic no. 6\n",
"Image_ID\t8570\t-\tWeight\t0.07082421149695753\n", "Image_ID\t8542\t-\tDistance\t11.232961895055158\n",
"Image_ID\t7774\t-\tWeight\t0.06546594547486784\n", "Image_ID\t6014\t-\tDistance\t11.304802835945505\n",
"Image_ID\t4152\t-\tWeight\t0.06440870014673937\n", "Image_ID\t8566\t-\tDistance\t11.443919577851908\n",
"Image_ID\t5118\t-\tWeight\t0.06264436903974217\n", "Image_ID\t7200\t-\tDistance\t11.484387898391537\n",
"Image_ID\t7784\t-\tWeight\t0.06203552824772957\n", "Image_ID\t6626\t-\tDistance\t11.48886846539337\n",
"Image_ID\t7798\t-\tWeight\t0.05899354962287138\n", "Image_ID\t6620\t-\tDistance\t11.578369802598303\n",
"Image_ID\t7896\t-\tWeight\t0.056484444935709706\n", "Image_ID\t6636\t-\tDistance\t11.662783932711658\n",
"Image_ID\t7766\t-\tWeight\t0.056063042928801675\n", "Image_ID\t8056\t-\tDistance\t11.74943673802499\n",
"Image_ID\t7792\t-\tWeight\t0.05557880301849769\n", "Image_ID\t7700\t-\tDistance\t11.769992973787971\n",
"Image_ID\t7834\t-\tWeight\t0.05556750918330256\n", "Image_ID\t6622\t-\tDistance\t11.780162710805048\n",
"Latent semantic no. 7\n", "Latent semantic no. 7\n",
"Image_ID\t1140\t-\tWeight\t0.05317423066517462\n", "Image_ID\t2646\t-\tDistance\t7.514711553618432\n",
"Image_ID\t5510\t-\tWeight\t0.052651188836683724\n", "Image_ID\t2260\t-\tDistance\t7.633993639248322\n",
"Image_ID\t5282\t-\tWeight\t0.05122146559887229\n", "Image_ID\t2460\t-\tDistance\t7.685809907469392\n",
"Image_ID\t1260\t-\tWeight\t0.050478632782130786\n", "Image_ID\t2660\t-\tDistance\t7.701780256364207\n",
"Image_ID\t1692\t-\tWeight\t0.05043911725770527\n", "Image_ID\t2418\t-\tDistance\t7.716363257255012\n",
"Image_ID\t8656\t-\tWeight\t0.04943228673655803\n", "Image_ID\t2240\t-\tDistance\t7.74734521250179\n",
"Image_ID\t1242\t-\tWeight\t0.04886689682608001\n", "Image_ID\t2430\t-\tDistance\t7.784825198465868\n",
"Image_ID\t7844\t-\tWeight\t0.048768495445578465\n", "Image_ID\t2264\t-\tDistance\t7.828411523843045\n",
"Image_ID\t5100\t-\tWeight\t0.04867702517715619\n", "Image_ID\t2242\t-\tDistance\t7.878806112518542\n",
"Image_ID\t5300\t-\tWeight\t0.048353062438932816\n", "Image_ID\t2196\t-\tDistance\t7.918897962650677\n",
"Latent semantic no. 8\n", "Latent semantic no. 8\n",
"Image_ID\t1798\t-\tWeight\t0.0458641229121734\n", "Image_ID\t562\t-\tDistance\t8.552732623243445\n",
"Image_ID\t1802\t-\tWeight\t0.044772142290101194\n", "Image_ID\t796\t-\tDistance\t9.316343355329956\n",
"Image_ID\t1806\t-\tWeight\t0.044448676280621935\n", "Image_ID\t612\t-\tDistance\t9.451362646413244\n",
"Image_ID\t1202\t-\tWeight\t0.043679466488681894\n", "Image_ID\t476\t-\tDistance\t9.458717454426738\n",
"Image_ID\t1786\t-\tWeight\t0.043513712296368134\n", "Image_ID\t798\t-\tDistance\t9.853412912988212\n",
"Image_ID\t1784\t-\tWeight\t0.043467657416343425\n", "Image_ID\t460\t-\tDistance\t9.859458462429464\n",
"Image_ID\t1790\t-\tWeight\t0.04288750664761759\n", "Image_ID\t190\t-\tDistance\t10.065071186269668\n",
"Image_ID\t1642\t-\tWeight\t0.041863484069841764\n", "Image_ID\t462\t-\tDistance\t10.065893471754435\n",
"Image_ID\t1788\t-\tWeight\t0.04089406629514224\n", "Image_ID\t456\t-\tDistance\t10.099056881970604\n",
"Image_ID\t1796\t-\tWeight\t0.04068815222347914\n", "Image_ID\t828\t-\tDistance\t10.29276769283984\n",
"Latent semantic no. 9\n", "Latent semantic no. 9\n",
"Image_ID\t8616\t-\tWeight\t-0.001110683188398373\n", "Image_ID\t3124\t-\tDistance\t12.500361886870435\n",
"Image_ID\t5234\t-\tWeight\t-0.001470742377963864\n", "Image_ID\t8064\t-\tDistance\t12.967833703429173\n",
"Image_ID\t3838\t-\tWeight\t-0.0018268938101953923\n", "Image_ID\t4270\t-\tDistance\t13.225230811650766\n",
"Image_ID\t7428\t-\tWeight\t-0.001978912864613778\n", "Image_ID\t7720\t-\tDistance\t13.340802785257075\n",
"Image_ID\t4664\t-\tWeight\t-0.0020551982165007863\n", "Image_ID\t8050\t-\tDistance\t13.601572206798334\n",
"Image_ID\t2754\t-\tWeight\t-0.002091620047637018\n", "Image_ID\t8074\t-\tDistance\t13.693355761074226\n",
"Image_ID\t2806\t-\tWeight\t-0.0021702921217260757\n", "Image_ID\t8042\t-\tDistance\t13.72102497292387\n",
"Image_ID\t3820\t-\tWeight\t-0.002247214027498397\n", "Image_ID\t6450\t-\tDistance\t13.750626256669166\n",
"Image_ID\t3786\t-\tWeight\t-0.002360567100195792\n", "Image_ID\t8018\t-\tDistance\t13.768703250806348\n",
"Image_ID\t4928\t-\tWeight\t-0.002395118791388935\n" "Image_ID\t6628\t-\tDistance\t13.784107713433421\n"
] ]
} }
], ],

View File

@ -712,94 +712,63 @@ valid_dim_reduction_methods = {
"kmeans": 4, "kmeans": 4,
} }
class KMeans:
    """K-Means clustering using Euclidean distance.

    Centroids are seeded with distinct samples drawn at random from the
    training data and refined by alternating an assignment step and a
    mean-update step until the centroids stop moving or ``max_iter``
    iterations have run.

    Parameters
    ----------
    n_clusters : int
        Number of clusters (and of output dimensions of ``transform``).
    tol : float, default 0.001
        Convergence threshold on the summed per-coordinate centroid change
        between two consecutive iterations (see ``_centroid_shift``).
    max_iter : int, default 300
        Upper bound on the number of assignment/update iterations.
    verbose : int, default 0
        When greater than 0, progress messages are printed to stdout.

    Attributes
    ----------
    cluster_centers_ : numpy.ndarray or None
        ``(n_clusters, n_features)`` float array of centroids; ``None``
        until ``fit`` has been called.
    n_iter_ : int
        Number of iterations executed by the most recent ``fit`` call.
    """

    def __init__(self, n_clusters, tol=0.001, max_iter=300, verbose=0):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.tol = tol
        self.cluster_centers_ = None
        self.n_iter_ = 0
        self.verbose = verbose

    @staticmethod
    def _as_samples(data):
        """Return ``data`` as a 2-D float array of shape ``(n_samples, n_features)``.

        Casting to float matters: with integer input the centroid array would
        be integer-typed and every computed mean would be truncated.

        Raises
        ------
        ValueError
            If ``data`` is not two-dimensional.
        """
        samples = np.asarray(data, dtype=float)
        if samples.ndim != 2:
            raise ValueError(
                "Expected data of shape (n_samples, n_features), "
                f"got an array with {samples.ndim} dimension(s)."
            )
        return samples

    @staticmethod
    def _centroid_shift(previous, current):
        """Return the summed per-coordinate change between two centroid sets.

        The change is relative (``|prev - cur| / |prev|``) wherever the
        previous coordinate is non-zero.  Where it is exactly zero the
        absolute change is used instead, so that zero-valued coordinates do
        not produce NaN/inf and permanently defeat the convergence test.
        """
        delta = np.abs(previous - current)
        scale = np.abs(previous)
        # Entries where ``scale == 0`` keep the value pre-loaded in ``out``,
        # i.e. the absolute change.
        relative = np.divide(delta, scale, out=delta.copy(), where=scale > 0)
        return relative.sum()

    def _initialize_centroids(self, data):
        """Seed ``cluster_centers_`` with ``n_clusters`` distinct random samples."""
        random_indices = np.random.choice(
            data.shape[0], self.n_clusters, replace=False
        )
        # Fancy indexing copies, so later centroid updates never write into
        # the caller's data; the float cast keeps means from being truncated.
        self.cluster_centers_ = np.array(data[random_indices], dtype=float)

    def fit(self, data):
        """Fit centroids on data of `(n_samples, n_features)` dimensions.

        Parameters
        ----------
        data : array-like
            Training samples, one row per sample.

        Returns
        -------
        KMeans
            ``self``, to allow call chaining.

        Raises
        ------
        ValueError
            If ``data`` is not 2-D or has fewer samples than ``n_clusters``.
        """
        data = self._as_samples(data)
        n_samples = data.shape[0]
        if self.n_clusters > n_samples:
            raise ValueError(
                f"n_clusters={self.n_clusters} exceeds the number of "
                f"samples ({n_samples})."
            )

        self._initialize_centroids(data)
        if self.verbose > 0:
            print("Initialized centroids")

        labels = np.empty(n_samples, dtype=int)
        self.n_iter_ = 0
        for itr in range(self.max_iter):
            self.n_iter_ = itr + 1

            # Assignment step: one sample at a time keeps peak memory at
            # O(n_clusters * n_features) instead of materialising the full
            # (n_samples, n_clusters, n_features) difference tensor.
            for idx, feature_set in enumerate(data):
                distances = np.linalg.norm(
                    feature_set - self.cluster_centers_, axis=1
                )
                labels[idx] = np.argmin(distances)

            prev_centroids = self.cluster_centers_.copy()

            # Update step.
            for c in range(self.n_clusters):
                members = data[labels == c]
                if len(members) > 0:
                    self.cluster_centers_[c] = members.mean(axis=0)
                else:
                    # Reinitialize an empty cluster's centroid to a random
                    # point in the dataset
                    random_index = np.random.choice(n_samples)
                    self.cluster_centers_[c] = data[random_index]

            # Check if centroids have converged
            shift = self._centroid_shift(prev_centroids, self.cluster_centers_)
            if shift < self.tol:
                if self.verbose > 0:
                    print(f"Iteration {itr} - Converged")
                break

        return self

    def transform(self, data):
        """Map data of `(n_samples, n_features)` to `(n_samples, n_clusters)`.

        Each output entry is the Euclidean distance from a sample to one of
        the fitted centroids.

        Raises
        ------
        ValueError
            If the model has not been fitted, or ``data`` is not 2-D.
        """
        if self.cluster_centers_ is None:
            raise ValueError("Fit the model first using the 'fit' method.")

        data = self._as_samples(data)
        Y = np.empty((data.shape[0], self.n_clusters))
        for idx, feature_set in enumerate(data):
            Y[idx] = np.linalg.norm(feature_set - self.cluster_centers_, axis=1)
        return Y
@ -943,7 +912,7 @@ def extract_latent_semantics_from_feature_model(
all_latent_semantics = { all_latent_semantics = {
"image-semantic": Y.tolist(), "image-semantic": Y.tolist(),
"semantic-feature": list(CC.values()), "semantic-feature": CC.tolist(),
} }
# for each latent semantic, sort imageID-weight pairs by weights in descending order # for each latent semantic, sort imageID-weight pairs by weights in descending order
@ -956,9 +925,14 @@ def extract_latent_semantics_from_feature_model(
for latent_semantic in Y.T for latent_semantic in Y.T
] ]
if valid_dim_reduction_methods[dim_reduction_method] == 4:
print("Note: for K-Means we display distances, in ascending order")
for idx, latent_semantic in enumerate(displayed_latent_semantics): for idx, latent_semantic in enumerate(displayed_latent_semantics):
print(f"Latent semantic no. {idx}") print(f"Latent semantic no. {idx}")
for image_id, weight in latent_semantic: for image_id, weight in latent_semantic:
if valid_dim_reduction_methods[dim_reduction_method] == 4:
print(f"Image_ID\t{image_id}\t-\tDistance\t{weight}")
else:
print(f"Image_ID\t{image_id}\t-\tWeight\t{weight}") print(f"Image_ID\t{image_id}\t-\tWeight\t{weight}")
with open( with open(
@ -1123,9 +1097,14 @@ def extract_latent_semantics_from_sim_matrix(
for latent_semantic in Y.T for latent_semantic in Y.T
] ]
if valid_dim_reduction_methods[dim_reduction_method] == 4:
print("Note: for K-Means we display distances, in ascending order")
for idx, latent_semantic in enumerate(displayed_latent_semantics): for idx, latent_semantic in enumerate(displayed_latent_semantics):
print(f"Latent semantic no. {idx}") print(f"Latent semantic no. {idx}")
for obj_id, weight in latent_semantic: for obj_id, weight in latent_semantic:
if valid_dim_reduction_methods[dim_reduction_method] == 4:
print(f"{sim_type}\t{obj_id}\t-\tDistance\t{weight}")
else:
print(f"{sim_type}\t{obj_id}\t-\tWeight\t{weight}") print(f"{sim_type}\t{obj_id}\t-\tWeight\t{weight}")
# Finally also save sim_matrix # Finally also save sim_matrix