diff --git a/Phase 2/task3.ipynb b/Phase 2/task3.ipynb
deleted file mode 100644
index ba60981..0000000
--- a/Phase 2/task3.ipynb	
+++ /dev/null
@@ -1,142 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import json\n",
-    "from pymongo import MongoClient\n",
-    "from task0a import *\n",
-    "import scipy\n",
-    "import numpy as np\n",
-    "from sklearn.decomposition import NMF\n",
-    "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n",
-    "from sklearn.cluster import KMeans\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "client = MongoClient()\n",
-    "client = MongoClient(host=\"localhost\", port=27017)\n",
-    "\n",
-    "# Select the database\n",
-    "db = client.Multimedia_Web_DBs\n",
-    "\n",
-    "# Fetch all documents from the collection and then sort them by \"_id\"\n",
-    "feature_descriptors = list(db.Caltech101_Feature_Descriptors.find({}))\n",
-    "feature_descriptors = sorted(list(db.Caltech101_Feature_Descriptors.find({})), key=lambda x: x[\"_id\"], reverse=False)\n",
-    "\n",
-    "num_labels = 101"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def extractKLatentSemantics(k, feature_model, dim_reduction):\n",
-    "\n",
-    "  feature_vectors = [x[feature_model] for x in feature_descriptors if x[\"_id\"] % 2 == 0]\n",
-    "  feature_labels = [x[\"label\"] for x in feature_descriptors if x[\"_id\"] % 2 == 0]\n",
-    "  feature_ids = [x[\"_id\"] for x in feature_descriptors if x[\"_id\"] % 2 == 0]\n",
-    "\n",
-    "  filename = ''\n",
-    "\n",
-    "\n",
-    "  match dim_reduction:\n",
-    "\n",
-    "    case 1:\n",
-    "      filename = f'{feature_model}-svd-semantics.json'\n",
-    "      U, S, Vh = scipy.sparse.linalg.svds(np.array(feature_vectors), k=k)\n",
-    "      k_latent_semantics = sorted(list(zip(feature_ids, U.tolist())), key = lambda x: x[1][0], reverse = True)\n",
-    "\n",
-    "    case 2:\n",
-    "      filename = f'{feature_model}-nnmf-semantics.json'\n",
-    "      model = NMF(n_components = k, init = 'random', solver = 'cd', alpha_H = 0.01, alpha_W = 0.01, max_iter = 10000)\n",
-    "      min_value = np.min(feature_vectors)\n",
-    "      feature_vectors_shifted = feature_vectors - min_value\n",
-    "      U = model.fit_transform(np.array(feature_vectors_shifted))\n",
-    "      k_latent_semantics = sorted(list(zip(feature_ids, U.tolist())), key = lambda x: x[1][0], reverse = True)\n",
-    "\n",
-    "    case 3:\n",
-    "      filename = f'{feature_model}-lda-semantics.json'\n",
-    "      U = LinearDiscriminantAnalysis(n_components = k).fit_transform(feature_vectors, feature_labels)\n",
-    "      k_latent_semantics = sorted(list(zip(feature_ids, U.tolist())), key = lambda x: x[1][0], reverse = True)\n",
-    "\n",
-    "    case 4:\n",
-    "      filename = f'{feature_model}-kmeans-semantics.json'\n",
-    "      kmeans = KMeans(n_clusters = k)\n",
-    "      kmeans.fit(feature_vectors)\n",
-    "      U = kmeans.transform(feature_vectors)\n",
-    "      k_latent_semantics = sorted(list(zip(feature_ids, U.tolist())), key = lambda x: x[1][0], reverse = True)\n",
-    "  \n",
-    "  k_latent_semantics = [{\"_id\": item[0], \"semantics\": item[1]} for item in k_latent_semantics]\n",
-    "  with open(filename, 'w', encoding='utf-8') as f:\n",
-    "    json.dump(k_latent_semantics, f, ensure_ascii = False)\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def main():\n",
-    "\n",
-    "  # Load dataset\n",
-    "\n",
-    "  # User input for Image ID\n",
-    "  k = int(input(\"Enter k: \"))\n",
-    "\n",
-    "  features = ['color_moments', 'hog', 'layer3', 'avgpool', 'fc']\n",
-    "\n",
-    "  # User input for feature model to extract\n",
-    "  print(\"\\n1: Color moments\")\n",
-    "  print(\"2: HOG\")\n",
-    "  print(\"3: Resnet50 Avgpool layer\")\n",
-    "  print(\"4: Resnet50 Layer 3\")\n",
-    "  print(\"5: Resnet50 FC layer\")\n",
-    "  feature_model = features[int(input(\"Select the feature model: \")) - 1]\n",
-    "\n",
-    "  print(\"\\n1. SVD\")\n",
-    "  print(\"2. NNMF\")\n",
-    "  print(\"3. LDA\")\n",
-    "  print(\"4. k-means\")\n",
-    "  dim_reduction = int(input(\"Select the dimensionality reduction technique: \"))\n",
-    "\n",
-    "  extractKLatentSemantics(k, feature_model, dim_reduction)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "if __name__ == \"__main__\":\n",
-    "   main()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "language_info": {
-   "name": "python"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/Phase 2/task_5.ipynb b/Phase 2/task_5.ipynb
new file mode 100644
index 0000000..7f025ea
--- /dev/null
+++ b/Phase 2/task_5.ipynb	
@@ -0,0 +1,214 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from utils import *\n",
+    "warnings.filterwarnings('ignore')\n",
+    "%matplotlib inline"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fd_collection = getCollection(\"team_5_mwdb_phase_2\", \"fd_collection\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Applying svd on the fc_fd space to get 10 latent semantics (showing only top 10 image-weight pairs for each latent semantic)...\n",
+      "Latent semantic no. 0\n",
+      "Image_ID\t80\t-\tWeight\t0.2614097705550824\n",
+      "Image_ID\t74\t-\tWeight\t0.255431983850539\n",
+      "Image_ID\t72\t-\tWeight\t0.24329045773521019\n",
+      "Image_ID\t76\t-\tWeight\t0.22867416408250565\n",
+      "Image_ID\t38\t-\tWeight\t0.19933358228759127\n",
+      "Image_ID\t70\t-\tWeight\t0.18697368408982706\n",
+      "Image_ID\t78\t-\tWeight\t0.13796715203849405\n",
+      "Image_ID\t130\t-\tWeight\t0.12802644225327572\n",
+      "Image_ID\t128\t-\tWeight\t0.12766513481071043\n",
+      "Image_ID\t116\t-\tWeight\t0.12432195172872901\n",
+      "Latent semantic no. 1\n",
+      "Image_ID\t42\t-\tWeight\t0.24451953308549035\n",
+      "Image_ID\t104\t-\tWeight\t0.17513827022527176\n",
+      "Image_ID\t2\t-\tWeight\t0.17502495949250704\n",
+      "Image_ID\t0\t-\tWeight\t0.17209867451969002\n",
+      "Image_ID\t170\t-\tWeight\t0.16656363902027468\n",
+      "Image_ID\t96\t-\tWeight\t0.15318453472976815\n",
+      "Image_ID\t40\t-\tWeight\t0.1432149719665029\n",
+      "Image_ID\t44\t-\tWeight\t0.1429496131499582\n",
+      "Image_ID\t160\t-\tWeight\t0.13479710738132986\n",
+      "Image_ID\t6\t-\tWeight\t0.1264545662660414\n",
+      "Latent semantic no. 2\n",
+      "Image_ID\t86\t-\tWeight\t0.21244971577008848\n",
+      "Image_ID\t96\t-\tWeight\t0.19744514449239337\n",
+      "Image_ID\t90\t-\tWeight\t0.19463642108355275\n",
+      "Image_ID\t32\t-\tWeight\t0.18145091969843855\n",
+      "Image_ID\t42\t-\tWeight\t0.16316970985189788\n",
+      "Image_ID\t26\t-\tWeight\t0.15711519451212017\n",
+      "Image_ID\t184\t-\tWeight\t0.14991640994990046\n",
+      "Image_ID\t134\t-\tWeight\t0.1462330756631442\n",
+      "Image_ID\t40\t-\tWeight\t0.14437675159652016\n",
+      "Image_ID\t182\t-\tWeight\t0.1383518461119224\n",
+      "Latent semantic no. 3\n",
+      "Image_ID\t90\t-\tWeight\t0.1720078267722524\n",
+      "Image_ID\t156\t-\tWeight\t0.16000154385617743\n",
+      "Image_ID\t158\t-\tWeight\t0.1512646317732056\n",
+      "Image_ID\t160\t-\tWeight\t0.14646801598350143\n",
+      "Image_ID\t152\t-\tWeight\t0.1464352560589073\n",
+      "Image_ID\t150\t-\tWeight\t0.14619374900432364\n",
+      "Image_ID\t30\t-\tWeight\t0.14143498327111978\n",
+      "Image_ID\t36\t-\tWeight\t0.14028252934190766\n",
+      "Image_ID\t92\t-\tWeight\t0.14010606099568526\n",
+      "Image_ID\t96\t-\tWeight\t0.12878454015856147\n",
+      "Latent semantic no. 4\n",
+      "Image_ID\t0\t-\tWeight\t0.1851068625752792\n",
+      "Image_ID\t68\t-\tWeight\t0.18233577289211206\n",
+      "Image_ID\t70\t-\tWeight\t0.17658848660973384\n",
+      "Image_ID\t2\t-\tWeight\t0.1740864069632969\n",
+      "Image_ID\t64\t-\tWeight\t0.1652208125636303\n",
+      "Image_ID\t144\t-\tWeight\t0.1473307832877541\n",
+      "Image_ID\t140\t-\tWeight\t0.13555748295430797\n",
+      "Image_ID\t142\t-\tWeight\t0.12823249250147356\n",
+      "Image_ID\t86\t-\tWeight\t0.12718092599165637\n",
+      "Image_ID\t76\t-\tWeight\t0.1252879989162334\n",
+      "Latent semantic no. 5\n",
+      "Image_ID\t38\t-\tWeight\t0.18831453133913492\n",
+      "Image_ID\t44\t-\tWeight\t0.17741038115946053\n",
+      "Image_ID\t42\t-\tWeight\t0.16444727858214978\n",
+      "Image_ID\t130\t-\tWeight\t0.15436113645002744\n",
+      "Image_ID\t40\t-\tWeight\t0.1536450181907607\n",
+      "Image_ID\t132\t-\tWeight\t0.14964910372393345\n",
+      "Image_ID\t46\t-\tWeight\t0.147369630386678\n",
+      "Image_ID\t36\t-\tWeight\t0.14003912645014002\n",
+      "Image_ID\t128\t-\tWeight\t0.13864439525825356\n",
+      "Image_ID\t138\t-\tWeight\t0.13770732538821512\n",
+      "Latent semantic no. 6\n",
+      "Image_ID\t114\t-\tWeight\t0.15664448468019831\n",
+      "Image_ID\t2\t-\tWeight\t0.15491061836983144\n",
+      "Image_ID\t0\t-\tWeight\t0.1530303208538504\n",
+      "Image_ID\t6\t-\tWeight\t0.15295162665264536\n",
+      "Image_ID\t106\t-\tWeight\t0.14505207452002586\n",
+      "Image_ID\t110\t-\tWeight\t0.14364619871330633\n",
+      "Image_ID\t104\t-\tWeight\t0.14360445482307752\n",
+      "Image_ID\t116\t-\tWeight\t0.14309751290704328\n",
+      "Image_ID\t108\t-\tWeight\t0.14103122187663494\n",
+      "Image_ID\t112\t-\tWeight\t0.13936814882577545\n",
+      "Latent semantic no. 7\n",
+      "Image_ID\t158\t-\tWeight\t0.15332739573127638\n",
+      "Image_ID\t152\t-\tWeight\t0.15027095321242787\n",
+      "Image_ID\t2\t-\tWeight\t0.148228537938103\n",
+      "Image_ID\t0\t-\tWeight\t0.14693245027728857\n",
+      "Image_ID\t156\t-\tWeight\t0.1439438847861891\n",
+      "Image_ID\t8\t-\tWeight\t0.14356918947005834\n",
+      "Image_ID\t10\t-\tWeight\t0.1431162549061445\n",
+      "Image_ID\t6\t-\tWeight\t0.14277108702825383\n",
+      "Image_ID\t150\t-\tWeight\t0.1424099571884803\n",
+      "Image_ID\t164\t-\tWeight\t0.13731169848767164\n",
+      "Latent semantic no. 8\n",
+      "Image_ID\t136\t-\tWeight\t0.14826723874051348\n",
+      "Image_ID\t142\t-\tWeight\t0.1444905135922577\n",
+      "Image_ID\t116\t-\tWeight\t0.14310970423245634\n",
+      "Image_ID\t132\t-\tWeight\t0.13967210710664973\n",
+      "Image_ID\t152\t-\tWeight\t0.13699976834141417\n",
+      "Image_ID\t114\t-\tWeight\t0.13649814331495427\n",
+      "Image_ID\t138\t-\tWeight\t0.13624706512987708\n",
+      "Image_ID\t106\t-\tWeight\t0.13620952950667425\n",
+      "Image_ID\t110\t-\tWeight\t0.1346054901033104\n",
+      "Image_ID\t144\t-\tWeight\t0.13436573258693213\n",
+      "Latent semantic no. 9\n",
+      "Image_ID\t38\t-\tWeight\t0.15911686596038474\n",
+      "Image_ID\t2\t-\tWeight\t0.15207108925634513\n",
+      "Image_ID\t0\t-\tWeight\t0.15116756158498235\n",
+      "Image_ID\t6\t-\tWeight\t0.15009399187071035\n",
+      "Image_ID\t10\t-\tWeight\t0.14437025978168486\n",
+      "Image_ID\t4\t-\tWeight\t0.14315858315130434\n",
+      "Image_ID\t34\t-\tWeight\t0.14296451776950192\n",
+      "Image_ID\t22\t-\tWeight\t0.14272703151065388\n",
+      "Image_ID\t24\t-\tWeight\t0.14254462871698045\n",
+      "Image_ID\t20\t-\tWeight\t0.14096073579756538\n"
+     ]
+    }
+   ],
+   "source": [
+    "selected_feature_model = valid_feature_models[\n",
+    "    str(input(\"Enter feature model - one of \" + str(list(valid_feature_models.keys()))))\n",
+    "]\n",
+    "\n",
+    "k = int(input(\"Enter value of k: \"))\n",
+    "if k < 1:\n",
+    "    raise ValueError(\"k should be a positive integer\")\n",
+    "\n",
+    "selected_dim_reduction_method = str(\n",
+    "    input(\n",
+    "        \"Enter dimensionality reduction method - one of \"\n",
+    "        + str(list(valid_dim_reduction_methods.keys()))\n",
+    "    )\n",
+    ")\n",
+    "\n",
+    "label_sim_matrix = find_label_label_similarity(fd_collection,selected_feature_model)\n",
+    "\n",
+    "extract_latent_semantics(\n",
+    "    fd_collection,\n",
+    "    k,\n",
+    "    selected_feature_model,\n",
+    "    selected_dim_reduction_method,\n",
+    "    sim_matrix=label_sim_matrix,\n",
+    "    top_images=10,\n",
+    "    fn_prefix='label_sim-'\n",
+    ")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/Phase 2/utils.py b/Phase 2/utils.py
index 3e822bd..312d324 100644
--- a/Phase 2/utils.py	
+++ b/Phase 2/utils.py	
@@ -641,9 +641,9 @@ class KMeans:
             for c in self.cluster_centers_:
                 prev_centroid = prev_centroids[c]
                 current_centroid = self.cluster_centers_[c]
-                convergence_tol = np.sum(abs(
-                    (prev_centroid - current_centroid) / prev_centroid * 100.0
-                ))
+                convergence_tol = np.sum(
+                    abs((prev_centroid - current_centroid) / prev_centroid * 100.0)
+                )
                 if convergence_tol > self.tol:
                     optimized = False
                     if self.verbose > 0:
@@ -676,11 +676,19 @@ class KMeans:
 
 
 def extract_latent_semantics(
-    fd_collection, k, feature_model, dim_reduction_method, top_images=None
+    fd_collection,
+    k,
+    feature_model,
+    dim_reduction_method,
+    sim_matrix=None,
+    top_images=None,
+    fn_prefix="",
 ):
     """
     Extract latent semantics for entire collection at once for a given feature_model and dim_reduction_method, and display the imageID-semantic weight pairs
 
+    Use `sim_matrix` to manually give similarity matrix instead of feature space
+
     Leave `top_images` blank to display all imageID-weight pairs
     """
 
@@ -694,18 +702,28 @@ def extract_latent_semantics(
     )
 
     all_images = list(fd_collection.find())
-    feature_vectors = np.array([img[feature_model] for img in all_images])
-    feature_labels = [img["true_label"] for img in all_images]
     feature_ids = [img["image_id"] for img in all_images]
-
+    
     top_img_str = ""
     if top_images is not None:
         top_img_str = f" (showing only top {top_images} image-weight pairs for each latent semantic)"
-    print(
-        "Applying {} on the {} space to get {} latent semantics{}...".format(
-            dim_reduction_method, feature_model, k, top_img_str
+
+    # if similarity matrix is provided
+    if sim_matrix is not None:
+        feature_vectors = sim_matrix
+        print(
+            "Applying {} on the {} space to get {} latent semantics{}...".format(
+                dim_reduction_method, feature_model, k, top_img_str
+            )
+        )
+    # else take feature space from database
+    else:
+        feature_vectors = np.array([img[feature_model] for img in all_images])
+        print(
+            "Applying {} on the given similarity matrix to get {} latent semantics{}...".format(
+                dim_reduction_method, k, top_img_str
+            )
         )
-    )
 
     displayed_latent_semantics = {}
     all_latent_semantics = {}
@@ -827,8 +845,38 @@ def extract_latent_semantics(
             print(f"Image_ID\t{image_id}\t-\tWeight\t{weight}")
 
     with open(
-        f"{feature_model}-{dim_reduction_method}-{k}-semantics.json",
+        f"{fn_prefix}{feature_model}-{dim_reduction_method}-{k}-semantics.json",
         "w",
         encoding="utf-8",
     ) as output_file:
         json.dump(all_latent_semantics, output_file, ensure_ascii=False)
+
+
+def find_label_label_similarity(fd_collection, feature_model):
+    """
+    Calculate similarity between labels. Lower values indicate higher similarities
+    """
+    assert (
+        feature_model in valid_feature_models.values()
+    ), "feature_model should be one of " + str(list(valid_feature_models.keys()))
+
+    label_sim_matrix = []
+    label_mean_vectors = []
+
+    num_labels = 101
+
+    for label in range(num_labels):
+        # get representative vectors for the label
+        label_mean_vectors.append(
+            calculate_label_representatives(fd_collection, label, feature_model)
+        )
+
+    label_sim_matrix = np.zeros((num_labels, num_labels))
+
+    for i in range(num_labels):
+        for j in range(i + 1, num_labels):
+            # Note: lower the value, lower the distance => higher the similarity
+            label_sim_matrix[i][j] = feature_distance_matches[feature_model](
+                np.array(label_mean_vectors[i]), np.array(label_mean_vectors[j])
+            )
+    return label_sim_matrix