mirror of
https://github.com/20kaushik02/CSE515_MWDB_Project.git
synced 2025-12-06 09:24:07 +00:00
refactored niraj's task 4 code
This commit is contained in:
parent
b935d9ca34
commit
eb0d50b1cd
@ -2,7 +2,7 @@
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -13,7 +13,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -25,7 +25,22 @@
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
"source": [
|
||||
"selected_feature_model = valid_feature_models[\n",
|
||||
" str(input(\"Enter feature model - one of \" + str(list(valid_feature_models.keys()))))\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"k = int(input(\"Enter value of k: \"))\n",
|
||||
"if k < 1:\n",
|
||||
" raise ValueError(\"k should be a positive integer\")\n",
|
||||
"\n",
|
||||
"extract_CP_semantics_from_feature_model(\n",
|
||||
" fd_collection,\n",
|
||||
" k,\n",
|
||||
" selected_feature_model,\n",
|
||||
" top_images=10\n",
|
||||
")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
||||
103
Phase 2/utils.py
103
Phase 2/utils.py
@ -25,6 +25,7 @@ import json
|
||||
from os import getenv
|
||||
from dotenv import load_dotenv
|
||||
import warnings
|
||||
from joblib import dump, load
|
||||
|
||||
load_dotenv()
|
||||
|
||||
@ -716,6 +717,7 @@ valid_dim_reduction_methods = {
|
||||
"kmeans": 4,
|
||||
}
|
||||
|
||||
|
||||
class KMeans:
|
||||
def __init__(self, n_clusters, tol=0.001, max_iter=300, verbose=0):
|
||||
self.n_clusters = n_clusters
|
||||
@ -888,9 +890,9 @@ def extract_latent_semantics_from_feature_model(
|
||||
)
|
||||
model.fit(feature_vectors_shifted)
|
||||
|
||||
# K (k x fd_dim) is the factor matrix for latent semantic-feature pairs
|
||||
# K (k x fd_dim) is the pseudocount for latent semantic-feature pairs
|
||||
K = model.components_
|
||||
# X (4339 x k) is the other factor matrix for image ID-latent semantic pairs
|
||||
# X (4339 x k) is the image-semantic distribution (image ID-latent semantic pairs)
|
||||
X = model.transform(feature_vectors_shifted)
|
||||
|
||||
all_latent_semantics = {
|
||||
@ -898,6 +900,8 @@ def extract_latent_semantics_from_feature_model(
|
||||
"semantic-feature": K.tolist(),
|
||||
}
|
||||
|
||||
dump(model, f"{feature_model}-{dim_reduction_method}-{k}-model.joblib")
|
||||
|
||||
# for each latent semantic, sort imageID-weight pairs by weights in descending order
|
||||
displayed_latent_semantics = [
|
||||
sorted(
|
||||
@ -1060,9 +1064,9 @@ def extract_latent_semantics_from_sim_matrix(
|
||||
)
|
||||
model.fit(feature_vectors_shifted)
|
||||
|
||||
# K (k x fd_dim) is the factor matrix for latent semantic-feature pairs
|
||||
# K (k x fd_dim) is the pseudocount for latent semantic-feature pairs
|
||||
K = model.components_
|
||||
# X (4339 x k) is the other factor matrix for image ID-latent semantic pairs
|
||||
# X (4339 x k) is the image-semantic distribution (image ID-latent semantic pairs)
|
||||
X = model.transform(feature_vectors_shifted)
|
||||
|
||||
all_latent_semantics = {
|
||||
@ -1070,6 +1074,11 @@ def extract_latent_semantics_from_sim_matrix(
|
||||
"semantic-feature": K.tolist(),
|
||||
}
|
||||
|
||||
dump(
|
||||
model,
|
||||
f"{sim_type}-{feature_model}-{dim_reduction_method}-{k}-model.joblib",
|
||||
)
|
||||
|
||||
# for each latent semantic, sort object-weight pairs by weights in descending order
|
||||
displayed_latent_semantics = [
|
||||
sorted(
|
||||
@ -1175,6 +1184,7 @@ def find_image_image_similarity(fd_collection, feature_model):
|
||||
](np.array(feature_vectors[i]), np.array(feature_vectors[j]))
|
||||
return image_sim_matrix
|
||||
|
||||
|
||||
def compute_cp_decomposition(fd_collection, feature_model, rank):
|
||||
assert (
|
||||
feature_model in valid_feature_models.values()
|
||||
@ -1183,11 +1193,84 @@ def compute_cp_decomposition(fd_collection, feature_model, rank):
|
||||
all_images = list(fd_collection.find())
|
||||
|
||||
# (images, features, labels)
|
||||
data_tensor_shape = (NUM_IMAGES, len(all_images[0][feature_model]), NUM_LABELS)
|
||||
data_tensor_shape = (
|
||||
NUM_IMAGES,
|
||||
np.array(all_images[0][feature_model]).flatten().shape[0],
|
||||
NUM_LABELS,
|
||||
)
|
||||
data_tensor = np.zeros(data_tensor_shape)
|
||||
for id in range(NUM_IMAGES):
|
||||
label = all_images[id]["true_label"]
|
||||
data_tensor[id, :, label] = all_images[id][feature_model]
|
||||
|
||||
weights_tensor, factor_matrices = tl.decomposition.parafac(data_tensor, rank=rank, normalize_factors=True)
|
||||
print(data_tensor_shape)
|
||||
|
||||
# create data tensor
|
||||
for img_id in range(NUM_IMAGES):
|
||||
label = all_images[img_id]["true_label"]
|
||||
data_tensor[img_id, :, label] = np.array(
|
||||
all_images[img_id][feature_model]
|
||||
).flatten()
|
||||
|
||||
weights_tensor, factor_matrices = tl.decomposition.parafac(
|
||||
data_tensor, rank=rank, normalize_factors=True
|
||||
)
|
||||
return weights_tensor, factor_matrices
|
||||
|
||||
|
||||
def extract_CP_semantics_from_feature_model(
    fd_collection,
    rank,
    feature_model,
    top_images=None,
):
    """Extract, display and save latent semantics obtained by CP decomposition.

    Runs `compute_cp_decomposition` for the chosen feature model, prints the
    object-weight pairs of every latent semantic for each of the three tensor
    modes (image, feature, label) in descending order of weight, and writes
    all factor matrices plus the weights to
    `{feature_model}-cp-{rank}-semantics.json`.

    Args:
        fd_collection: collection handle passed through to
            `compute_cp_decomposition`.
        rank: number of latent semantics (rank of the CP decomposition).
        feature_model: one of the values of `valid_feature_models`.
        top_images: if given, only this many highest-weighted objects are
            printed per latent semantic; `None` prints all of them. The saved
            JSON always contains the full factor matrices.

    Raises:
        AssertionError: if `feature_model` is not a valid feature model.
    """
    assert (
        feature_model in valid_feature_models.values()
    ), "feature_model should be one of " + str(list(valid_feature_models.keys()))

    top_img_str = ""
    if top_images is not None:
        top_img_str = f" (showing only top {top_images} image-weight pairs for each latent semantic)"
    print(
        "Applying CP decomposition on the {} space to get {} latent semantics{}...".format(
            feature_model, rank, top_img_str
        )
    )

    weights_tensor, factor_matrices = compute_cp_decomposition(
        fd_collection, feature_model, rank
    )

    all_latent_semantics = {
        "image-semantic": factor_matrices[0].tolist(),
        "feature-semantic": factor_matrices[1].tolist(),
        "label-semantic": factor_matrices[2].tolist(),
        "semantics-core": weights_tensor.tolist(),
    }

    mode_names = ("image", "feature", "label")
    for mode_name, factor_matrix in zip(mode_names, factor_matrices):
        # Each factor matrix is expected to be (mode size x rank): one row per
        # object, one column per latent semantic (per the CP factor layout
        # returned by parafac -- confirm if the tensor backend changes).
        # Transposing gives one vector per latent semantic; iterating the
        # rows directly would pair object IDs with component indices instead.
        semantic_columns = np.asarray(factor_matrix).T

        print(f"Showing {mode_name}-weight latent semantic")
        for idx, latent_semantic in enumerate(semantic_columns):
            # Object IDs are the row indices of the factor matrix, so they
            # always match the (flattened) dimension used to build the tensor.
            ranked_pairs = sorted(
                enumerate(latent_semantic),
                key=lambda pair: pair[1],
                reverse=True,
            )[:top_images]

            print(f"Latent semantic no. {idx}")
            for obj_id, weight in ranked_pairs:
                print(f"{mode_name}\t{obj_id}\t-\tweight\t{weight}")

    with open(
        f"{feature_model}-cp-{rank}-semantics.json",
        "w",
        encoding="utf-8",
    ) as output_file:
        json.dump(all_latent_semantics, output_file, ensure_ascii=False)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user