Refactor Niraj's Task 4 code

Kaushik Narayan R 2023-10-13 12:56:36 -07:00
parent b935d9ca34
commit eb0d50b1cd
2 changed files with 111 additions and 13 deletions

View File

@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -13,7 +13,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -25,7 +25,22 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "selected_feature_model = valid_feature_models[\n",
+    "    str(input(\"Enter feature model - one of \" + str(list(valid_feature_models.keys()))))\n",
+    "]\n",
+    "\n",
+    "k = int(input(\"Enter value of k: \"))\n",
+    "if k < 1:\n",
+    "    raise ValueError(\"k should be a positive integer\")\n",
+    "\n",
+    "extract_CP_semantics_from_feature_model(\n",
+    "    fd_collection,\n",
+    "    k,\n",
+    "    selected_feature_model,\n",
+    "    top_images=10\n",
+    ")"
+   ]
   }
  ],
  "metadata": {

View File

@@ -25,6 +25,7 @@ import json
 from os import getenv
 from dotenv import load_dotenv
 import warnings
+from joblib import dump, load
 
 load_dotenv()
@@ -716,6 +717,7 @@ valid_dim_reduction_methods = {
     "kmeans": 4,
 }
 
+
 class KMeans:
     def __init__(self, n_clusters, tol=0.001, max_iter=300, verbose=0):
         self.n_clusters = n_clusters
@@ -888,9 +890,9 @@ def extract_latent_semantics_from_feature_model(
     )
     model.fit(feature_vectors_shifted)
 
-    # K (k x fd_dim) is the factor matrix for latent semantic-feature pairs
+    # K (k x fd_dim) is the pseudocount for latent semantic-feature pairs
     K = model.components_
-    # X (4339 x k) is the other factor matrix for image ID-latent semantic pairs
+    # X (4339 x k) is the image-semantic distribution (image ID-latent semantic pairs)
     X = model.transform(feature_vectors_shifted)
 
     all_latent_semantics = {
@@ -898,6 +900,8 @@
         "semantic-feature": K.tolist(),
     }
 
+    dump(model, f"{feature_model}-{dim_reduction_method}-{k}-model.joblib")
+
     # for each latent semantic, sort imageID-weight pairs by weights in descending order
     displayed_latent_semantics = [
         sorted(
@@ -1060,9 +1064,9 @@ def extract_latent_semantics_from_sim_matrix(
     )
     model.fit(feature_vectors_shifted)
 
-    # K (k x fd_dim) is the factor matrix for latent semantic-feature pairs
+    # K (k x fd_dim) is the pseudocount for latent semantic-feature pairs
     K = model.components_
-    # X (4339 x k) is the other factor matrix for image ID-latent semantic pairs
+    # X (4339 x k) is the image-semantic distribution (image ID-latent semantic pairs)
     X = model.transform(feature_vectors_shifted)
 
     all_latent_semantics = {
@@ -1070,6 +1074,11 @@
         "semantic-feature": K.tolist(),
     }
 
+    dump(
+        model,
+        f"{sim_type}-{feature_model}-{dim_reduction_method}-{k}-model.joblib",
+    )
+
     # for each latent semantic, sort object-weight pairs by weights in descending order
     displayed_latent_semantics = [
         sorted(
@@ -1175,6 +1184,7 @@ def find_image_image_similarity(fd_collection, feature_model):
             ](np.array(feature_vectors[i]), np.array(feature_vectors[j]))
     return image_sim_matrix
 
+
 def compute_cp_decomposition(fd_collection, feature_model, rank):
     assert (
         feature_model in valid_feature_models.values()
@@ -1183,11 +1193,84 @@ def compute_cp_decomposition(fd_collection, feature_model, rank):
     all_images = list(fd_collection.find())
 
     # (images, features, labels)
-    data_tensor_shape = (NUM_IMAGES, len(all_images[0][feature_model]), NUM_LABELS)
+    data_tensor_shape = (
+        NUM_IMAGES,
+        np.array(all_images[0][feature_model]).flatten().shape[0],
+        NUM_LABELS,
+    )
     data_tensor = np.zeros(data_tensor_shape)
-    for id in range(NUM_IMAGES):
-        label = all_images[id]["true_label"]
-        data_tensor[id, :, label] = all_images[id][feature_model]
+    print(data_tensor_shape)
+
+    # create data tensor
+    for img_id in range(NUM_IMAGES):
+        label = all_images[img_id]["true_label"]
+        data_tensor[img_id, :, label] = np.array(
+            all_images[img_id][feature_model]
+        ).flatten()
 
-    weights_tensor, factor_matrices = tl.decomposition.parafac(data_tensor, rank=rank, normalize_factors=True)
+    weights_tensor, factor_matrices = tl.decomposition.parafac(
+        data_tensor, rank=rank, normalize_factors=True
+    )
     return weights_tensor, factor_matrices
+
+
+def extract_CP_semantics_from_feature_model(
+    fd_collection,
+    rank,
+    feature_model,
+    top_images=None,
+):
+    assert (
+        feature_model in valid_feature_models.values()
+    ), "feature_model should be one of " + str(list(valid_feature_models.keys()))
+
+    top_img_str = ""
+    if top_images is not None:
+        top_img_str = f" (showing only top {top_images} image-weight pairs for each latent semantic)"
+    print(
+        "Applying CP decomposition on the {} space to get {} latent semantics{}...".format(
+            feature_model, rank, top_img_str
+        )
+    )
+
+    all_images = list(fd_collection.find())
+    img_ids = [img for img in range(NUM_IMAGES)]
+    # use the flattened feature length so IDs line up with the tensor's feature mode
+    img_feature_ids = [
+        feature_num
+        for feature_num in range(
+            np.array(all_images[0][feature_model]).flatten().shape[0]
+        )
+    ]
+    img_label_ids = [label for label in range(NUM_LABELS)]
+    feature_ids = [img_ids, img_feature_ids, img_label_ids]
+
+    weights_tensor, factor_matrices = compute_cp_decomposition(
+        fd_collection, feature_model, rank
+    )
+
+    all_latent_semantics = {
+        "image-semantic": factor_matrices[0].tolist(),
+        "feature-semantic": factor_matrices[1].tolist(),
+        "label-semantic": factor_matrices[2].tolist(),
+        "semantics-core": weights_tensor.tolist(),
+    }
+
+    strs = ["image", "feature", "label"]
+    for i in range(3):
+        # each factor matrix is (mode_size x rank), so its columns are the latent semantics
+        displayed_latent_semantics = [
+            sorted(
+                list(zip(feature_ids[i], latent_semantic)),
+                key=lambda x: x[1],
+                reverse=True,
+            )[:top_images]
+            for latent_semantic in factor_matrices[i].T
+        ]
+        print(f"Showing {strs[i]}-weight latent semantics")
+        for idx, latent_semantic in enumerate(displayed_latent_semantics):
+            print(f"Latent semantic no. {idx}")
+            for obj_id, weight in latent_semantic:
+                print(f"{strs[i]}\t{obj_id}\t-\tweight\t{weight}")
+
+    with open(
+        f"{feature_model}-cp-{rank}-semantics.json",
+        "w",
+        encoding="utf-8",
+    ) as output_file:
+        json.dump(all_latent_semantics, output_file, ensure_ascii=False)
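
For reference, TensorLy's parafac (called above as tl.decomposition.parafac) returns a CP tensor that unpacks into a weights vector of length rank plus one factor matrix of shape (mode_size, rank) per mode; latent semantic j is column j of each factor, which is why the display loop walks factor_matrices[i].T. A minimal sketch on a toy tensor (sizes are arbitrary, chosen only for illustration):

    # Toy CP decomposition mirroring the (images, features, labels) tensor layout.
    import numpy as np
    from tensorly.decomposition import parafac

    rng = np.random.default_rng(0)
    data_tensor = rng.random((6, 4, 3))  # (images, features, labels); sizes arbitrary

    weights, factors = parafac(data_tensor, rank=2, normalize_factors=True)
    print(weights.shape)               # (2,) -- one weight per latent semantic
    print([f.shape for f in factors])  # [(6, 2), (4, 2), (3, 2)]

    # Latent semantic 0 over the image mode is the first column of factors[0].
    image_semantic_0 = factors[0][:, 0]

The models persisted above with joblib's dump can likewise be restored later with the load imported alongside it, e.g. model = load(f"{feature_model}-{dim_reduction_method}-{k}-model.joblib").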