Refactored Pranav's task 3 code

Changed latent semantic storage; LDA is Latent Dirichlet Allocation, and the image-weight arrangement is reversed.
2026-03-11 23:34:05 +00:00 · 2023-10-10 14:58:28 -07:00
parent 5580611ba4
commit 78be91a0ca
4 changed files with 368 additions and 320 deletions
--- a/2/task1.ipynb
+++ b/2/task1.ipynb
@@ -1,218 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [
-    {
-     "ename": "ModuleNotFoundError",
-     "evalue": "No module named 'task0a'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[1;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
-      "\u001b[1;32me:\\Fall 23\\CSE 515 - Multimedia and web databases\\CSE515_MWDB_Project\\Phase 2\\task1.ipynb Cell 1\u001b[0m line \u001b[0;36m4\n\u001b[0;32m      <a href='vscode-notebook-cell:/e%3A/Fall%2023/CSE%20515%20-%20Multimedia%20and%20web%20databases/CSE515_MWDB_Project/Phase%202/task1.ipynb#W0sZmlsZQ%3D%3D?line=1'>2</a>\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mmath\u001b[39;00m\n\u001b[0;32m      <a href='vscode-notebook-cell:/e%3A/Fall%2023/CSE%20515%20-%20Multimedia%20and%20web%20databases/CSE515_MWDB_Project/Phase%202/task1.ipynb#W0sZmlsZQ%3D%3D?line=2'>3</a>\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mmatplotlib\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mpyplot\u001b[39;00m \u001b[39mas\u001b[39;00m \u001b[39mplt\u001b[39;00m\n\u001b[1;32m----> <a href='vscode-notebook-cell:/e%3A/Fall%2023/CSE%20515%20-%20Multimedia%20and%20web%20databases/CSE515_MWDB_Project/Phase%202/task1.ipynb#W0sZmlsZQ%3D%3D?line=3'>4</a>\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mtask0a\u001b[39;00m \u001b[39mimport\u001b[39;00m \u001b[39m*\u001b[39m\n\u001b[0;32m      <a href='vscode-notebook-cell:/e%3A/Fall%2023/CSE%20515%20-%20Multimedia%20and%20web%20databases/CSE515_MWDB_Project/Phase%202/task1.ipynb#W0sZmlsZQ%3D%3D?line=4'>5</a>\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mscipy\u001b[39;00m\n",
-      "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'task0a'"
-     ]
-    }
-   ],
-   "source": [
-    "from pymongo import MongoClient\n",
-    "import math\n",
-    "import matplotlib.pyplot as plt\n",
-    "# This was imported for the loadDataset function in the cell below\n",
-    "from task0a import *\n",
-    "import scipy"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [
-    {
-     "ename": "NameError",
-     "evalue": "name 'loadDataset' is not defined",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[1;31mNameError\u001b[0m                                 Traceback (most recent call last)",
-      "\u001b[1;32me:\\Fall 23\\CSE 515 - Multimedia and web databases\\CSE515_MWDB_Project\\Phase 2\\task1.ipynb Cell 2\u001b[0m line \u001b[0;36m7\n\u001b[0;32m      <a href='vscode-notebook-cell:/e%3A/Fall%2023/CSE%20515%20-%20Multimedia%20and%20web%20databases/CSE515_MWDB_Project/Phase%202/task1.ipynb#W1sZmlsZQ%3D%3D?line=3'>4</a>\u001b[0m \u001b[39m# Select the database\u001b[39;00m\n\u001b[0;32m      <a href='vscode-notebook-cell:/e%3A/Fall%2023/CSE%20515%20-%20Multimedia%20and%20web%20databases/CSE515_MWDB_Project/Phase%202/task1.ipynb#W1sZmlsZQ%3D%3D?line=4'>5</a>\u001b[0m db \u001b[39m=\u001b[39m client\u001b[39m.\u001b[39mMultimedia_Web_DBs\n\u001b[1;32m----> <a href='vscode-notebook-cell:/e%3A/Fall%2023/CSE%20515%20-%20Multimedia%20and%20web%20databases/CSE515_MWDB_Project/Phase%202/task1.ipynb#W1sZmlsZQ%3D%3D?line=6'>7</a>\u001b[0m caltechDataset \u001b[39m=\u001b[39m loadDataset()\n\u001b[0;32m      <a href='vscode-notebook-cell:/e%3A/Fall%2023/CSE%20515%20-%20Multimedia%20and%20web%20databases/CSE515_MWDB_Project/Phase%202/task1.ipynb#W1sZmlsZQ%3D%3D?line=8'>9</a>\u001b[0m \u001b[39m# Fetch all documents from the collection and then sort them by \"_id\"\u001b[39;00m\n\u001b[0;32m     <a href='vscode-notebook-cell:/e%3A/Fall%2023/CSE%20515%20-%20Multimedia%20and%20web%20databases/CSE515_MWDB_Project/Phase%202/task1.ipynb#W1sZmlsZQ%3D%3D?line=9'>10</a>\u001b[0m feature_descriptors \u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(db\u001b[39m.\u001b[39mCaltech101_Feature_Descriptors\u001b[39m.\u001b[39mfind({}))\n",
-      "\u001b[1;31mNameError\u001b[0m: name 'loadDataset' is not defined"
-     ]
-    }
-   ],
-   "source": [
-    "client = MongoClient()\n",
-    "client = MongoClient(host=\"localhost\", port=27017)\n",
-    "\n",
-    "# Select the database\n",
-    "db = client.Multimedia_Web_DBs\n",
-    "\n",
-    "# This function was the part of task 1 in my project directory. \n",
-    "# caltechDataset is in format (_id, image_pixels, label)\n",
-    "caltechDataset = loadDataset()\n",
-    "\n",
-    "# Fetch all documents from the collection and then sort them by \"_id\"\n",
-    "feature_descriptors = list(db.Caltech101_Feature_Descriptors.find({}))\n",
-    "feature_descriptors = sorted(list(db.Caltech101_Feature_Descriptors.find({})), key=lambda x: x[\"_id\"], reverse=False)\n",
-    "\n",
-    "num_labels = 101"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def calculate_label_means(l, feature_model):\n",
-    "  \n",
-    "  # Just picking the feature vector for that particular label from even _id rows in the dataset\n",
-    "  label_vectors = [x[feature_model] for x in feature_descriptors if x[\"label\"] == l and x[\"_id\"] % 2 == 0]\n",
-    " \n",
-    "  label_mean_vector = [sum(col)/len(col) for col in zip(*label_vectors)]\n",
-    "  return label_mean_vector"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def findKRelevantImages(mean_vector, feature_model, l):\n",
-    "\n",
-    "  # Same as in above function, but took ids as well.\n",
-    "  # Redundant step.\n",
-    "  label_vectors = [(x[\"_id\"], x[feature_model]) for x in feature_descriptors if x[\"_id\"] % 2 == 0]\n",
-    "\n",
-    "  n = len(label_vectors)\n",
-    "\n",
-    "  similarities = []\n",
-    "\n",
-    "  # Use the appropriate similarity based on feature model selected by the user\n",
-    "  match feature_model:\n",
-    "\n",
-    "    case \"color_moments\":\n",
-    "\n",
-    "      for i in range(n):\n",
-    "        similarities.append({\"_id\": label_vectors[i][0], \"similarity\": math.dist(mean_vector, label_vectors[i][1])})\n",
-    "      similarities = sorted(similarities, key=lambda x: x[\"similarity\"], reverse=False)\n",
-    "\n",
-    "    case \"hog\":\n",
-    "\n",
-    "      for i in range(n):\n",
-    "        similarities.append({\"_id\": label_vectors[i][0], \"similarity\": (np.dot(mean_vector, label_vectors[i][1]) / (np.linalg.norm(mean_vector) * np.linalg.norm(label_vectors[i][1])))})\n",
-    "      similarities = sorted(similarities, key=lambda x: x[\"similarity\"], reverse=True)\n",
-    "    \n",
-    "    case \"layer3\" | \"avgpool\" | \"fc\":\n",
-    "\n",
-    "      for i in range(n):\n",
-    "        similarities.append({\"_id\": label_vectors[i][0], \"similarity\": scipy.stats.pearsonr(mean_vector, label_vectors[i][1]).statistic})\n",
-    "      similarities = sorted(similarities, key=lambda x: x[\"similarity\"], reverse=True)\n",
-    "    \n",
-    "  return similarities\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def main():\n",
-    "\n",
-    "  # Load dataset\n",
-    "\n",
-    "  # User input for Image ID\n",
-    "  l = int(input(\"Enter query label: \"))\n",
-    "  k = int(input(\"Enter k: \"))\n",
-    "\n",
-    "  features = ['color_moments', 'hog', 'layer3', 'avgpool', 'fc']\n",
-    "\n",
-    "  # User input for feature model to extract\n",
-    "  print(\"1: Color moments\")\n",
-    "  print(\"2: HOG\")\n",
-    "  print(\"3: Resnet50 Avgpool layer\")\n",
-    "  print(\"4: Resnet50 Layer 3\")\n",
-    "  print(\"5: Resnet50 FC layer\")\n",
-    "  feature_model = features[int(input(\"Select the feature model: \")) - 1]\n",
-    "\n",
-    "  mean_vector = calculate_label_means(l, feature_model)\n",
-    "\n",
-    "  similar_images = findKRelevantImages(mean_vector, feature_model, l)\n",
-    "\n",
-    "  for i in range(k):\n",
-    "    print(similar_images[i])\n",
-    "\n",
-    "  # Show the \"k relevant images\"\n",
-    "  fig, axes = plt.subplots(1, k, figsize=(15, 5))\n",
-    "\n",
-    "  for i in range(k):\n",
-    "    # caltechDataset[similar_images[i][\"_id\"]][1] because\n",
-    "    # similar_images[i][\"_id\"] will provide me the image id\n",
-    "    # [1] will be image pixel values since caltechDataset is in format (id, pixels, label)\n",
-    "    axes[i].imshow(caltechDataset[similar_images[i][\"_id\"]][1].permute(1, 2, 0))\n",
-    "    axes[i].set_title(f'id: {similar_images[i][\"_id\"]}')\n",
-    "\n",
-    "  # Show the figure with all the images\n",
-    "  plt.show()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [
-    {
-     "ename": "KeyboardInterrupt",
-     "evalue": "Interrupted by user",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[1;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
-      "\u001b[1;32me:\\Fall 23\\CSE 515 - Multimedia and web databases\\CSE515_MWDB_Project\\Phase 2\\task1.ipynb Cell 6\u001b[0m line \u001b[0;36m2\n\u001b[0;32m      <a href='vscode-notebook-cell:/e%3A/Fall%2023/CSE%20515%20-%20Multimedia%20and%20web%20databases/CSE515_MWDB_Project/Phase%202/task1.ipynb#W5sZmlsZQ%3D%3D?line=0'>1</a>\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39m__name__\u001b[39m \u001b[39m==\u001b[39m \u001b[39m\"\u001b[39m\u001b[39m__main__\u001b[39m\u001b[39m\"\u001b[39m:\n\u001b[1;32m----> <a href='vscode-notebook-cell:/e%3A/Fall%2023/CSE%20515%20-%20Multimedia%20and%20web%20databases/CSE515_MWDB_Project/Phase%202/task1.ipynb#W5sZmlsZQ%3D%3D?line=1'>2</a>\u001b[0m    main()\n",
-      "\u001b[1;32me:\\Fall 23\\CSE 515 - Multimedia and web databases\\CSE515_MWDB_Project\\Phase 2\\task1.ipynb Cell 6\u001b[0m line \u001b[0;36m6\n\u001b[0;32m      <a href='vscode-notebook-cell:/e%3A/Fall%2023/CSE%20515%20-%20Multimedia%20and%20web%20databases/CSE515_MWDB_Project/Phase%202/task1.ipynb#W5sZmlsZQ%3D%3D?line=0'>1</a>\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mmain\u001b[39m():\n\u001b[0;32m      <a href='vscode-notebook-cell:/e%3A/Fall%2023/CSE%20515%20-%20Multimedia%20and%20web%20databases/CSE515_MWDB_Project/Phase%202/task1.ipynb#W5sZmlsZQ%3D%3D?line=1'>2</a>\u001b[0m \n\u001b[0;32m      <a href='vscode-notebook-cell:/e%3A/Fall%2023/CSE%20515%20-%20Multimedia%20and%20web%20databases/CSE515_MWDB_Project/Phase%202/task1.ipynb#W5sZmlsZQ%3D%3D?line=2'>3</a>\u001b[0m   \u001b[39m# Load dataset\u001b[39;00m\n\u001b[0;32m      <a href='vscode-notebook-cell:/e%3A/Fall%2023/CSE%20515%20-%20Multimedia%20and%20web%20databases/CSE515_MWDB_Project/Phase%202/task1.ipynb#W5sZmlsZQ%3D%3D?line=3'>4</a>\u001b[0m \n\u001b[0;32m      <a href='vscode-notebook-cell:/e%3A/Fall%2023/CSE%20515%20-%20Multimedia%20and%20web%20databases/CSE515_MWDB_Project/Phase%202/task1.ipynb#W5sZmlsZQ%3D%3D?line=4'>5</a>\u001b[0m   \u001b[39m# User input for Image ID\u001b[39;00m\n\u001b[1;32m----> <a href='vscode-notebook-cell:/e%3A/Fall%2023/CSE%20515%20-%20Multimedia%20and%20web%20databases/CSE515_MWDB_Project/Phase%202/task1.ipynb#W5sZmlsZQ%3D%3D?line=5'>6</a>\u001b[0m   l \u001b[39m=\u001b[39m \u001b[39mint\u001b[39m(\u001b[39minput\u001b[39;49m(\u001b[39m\"\u001b[39;49m\u001b[39mEnter query label: \u001b[39;49m\u001b[39m\"\u001b[39;49m))\n\u001b[0;32m      <a href='vscode-notebook-cell:/e%3A/Fall%2023/CSE%20515%20-%20Multimedia%20and%20web%20databases/CSE515_MWDB_Project/Phase%202/task1.ipynb#W5sZmlsZQ%3D%3D?line=6'>7</a>\u001b[0m   k \u001b[39m=\u001b[39m \u001b[39mint\u001b[39m(\u001b[39minput\u001b[39m(\u001b[39m\"\u001b[39m\u001b[39mEnter k: 
\u001b[39m\u001b[39m\"\u001b[39m))\n\u001b[0;32m      <a href='vscode-notebook-cell:/e%3A/Fall%2023/CSE%20515%20-%20Multimedia%20and%20web%20databases/CSE515_MWDB_Project/Phase%202/task1.ipynb#W5sZmlsZQ%3D%3D?line=8'>9</a>\u001b[0m   features \u001b[39m=\u001b[39m [\u001b[39m'\u001b[39m\u001b[39mcolor_moments\u001b[39m\u001b[39m'\u001b[39m, \u001b[39m'\u001b[39m\u001b[39mhog\u001b[39m\u001b[39m'\u001b[39m, \u001b[39m'\u001b[39m\u001b[39mlayer3\u001b[39m\u001b[39m'\u001b[39m, \u001b[39m'\u001b[39m\u001b[39mavgpool\u001b[39m\u001b[39m'\u001b[39m, \u001b[39m'\u001b[39m\u001b[39mfc\u001b[39m\u001b[39m'\u001b[39m]\n",
-      "File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python311\\site-packages\\ipykernel\\kernelbase.py:1202\u001b[0m, in \u001b[0;36mKernel.raw_input\u001b[1;34m(self, prompt)\u001b[0m\n\u001b[0;32m   1200\u001b[0m     msg \u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mraw_input was called, but this frontend does not support input requests.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m   1201\u001b[0m     \u001b[39mraise\u001b[39;00m StdinNotImplementedError(msg)\n\u001b[1;32m-> 1202\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_input_request(\n\u001b[0;32m   1203\u001b[0m     \u001b[39mstr\u001b[39;49m(prompt),\n\u001b[0;32m   1204\u001b[0m     \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_parent_ident[\u001b[39m\"\u001b[39;49m\u001b[39mshell\u001b[39;49m\u001b[39m\"\u001b[39;49m],\n\u001b[0;32m   1205\u001b[0m     \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mget_parent(\u001b[39m\"\u001b[39;49m\u001b[39mshell\u001b[39;49m\u001b[39m\"\u001b[39;49m),\n\u001b[0;32m   1206\u001b[0m     password\u001b[39m=\u001b[39;49m\u001b[39mFalse\u001b[39;49;00m,\n\u001b[0;32m   1207\u001b[0m )\n",
-      "File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python311\\site-packages\\ipykernel\\kernelbase.py:1245\u001b[0m, in \u001b[0;36mKernel._input_request\u001b[1;34m(self, prompt, ident, parent, password)\u001b[0m\n\u001b[0;32m   1242\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mKeyboardInterrupt\u001b[39;00m:\n\u001b[0;32m   1243\u001b[0m     \u001b[39m# re-raise KeyboardInterrupt, to truncate traceback\u001b[39;00m\n\u001b[0;32m   1244\u001b[0m     msg \u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mInterrupted by user\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m-> 1245\u001b[0m     \u001b[39mraise\u001b[39;00m \u001b[39mKeyboardInterrupt\u001b[39;00m(msg) \u001b[39mfrom\u001b[39;00m \u001b[39mNone\u001b[39;00m\n\u001b[0;32m   1246\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mException\u001b[39;00m:\n\u001b[0;32m   1247\u001b[0m     \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mlog\u001b[39m.\u001b[39mwarning(\u001b[39m\"\u001b[39m\u001b[39mInvalid Message:\u001b[39m\u001b[39m\"\u001b[39m, exc_info\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m)\n",
-      "\u001b[1;31mKeyboardInterrupt\u001b[0m: Interrupted by user"
-     ]
-    }
-   ],
-   "source": [
-    "if __name__ == \"__main__\":\n",
-    "   main()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.4"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
--- a/2/task3.ipynb
+++ b/2/task3.ipynb
@@ -0,0 +1,142 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "from pymongo import MongoClient\n",
+    "from task0a import *\n",
+    "import scipy\n",
+    "import numpy as np\n",
+    "from sklearn.decomposition import NMF\n",
+    "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n",
+    "from sklearn.cluster import KMeans\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Connect to the local MongoDB instance\n",
+    "client = MongoClient(host=\"localhost\", port=27017)\n",
+    "\n",
+    "# Select the database\n",
+    "db = client.Multimedia_Web_DBs\n",
+    "\n",
+    "# Fetch all documents from the collection once and sort them by \"_id\"\n",
+    "feature_descriptors = sorted(db.Caltech101_Feature_Descriptors.find({}), key=lambda x: x[\"_id\"], reverse=False)\n",
+    "\n",
+    "# Label count for the dataset; presumably the Caltech101 categories - TODO confirm\n",
+    "num_labels = 101"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def extractKLatentSemantics(k, feature_model, dim_reduction):\n",
+    "  \"\"\"Reduce the even-_id feature vectors to k latent semantics and dump image-weight pairs to JSON.\n",
+    "\n",
+    "  k: number of latent semantics; feature_model: descriptor key to read from each document (e.g. 'hog').\n",
+    "  dim_reduction: 1 = SVD, 2 = NNMF, 3 = LDA, 4 = k-means; any other value raises ValueError.\n",
+    "  Writes '<feature_model>-<technique>-semantics.json' to the working directory, sorted by first weight.\n",
+    "  \"\"\"\n",
+    "  # Filter once so vectors, labels and ids are guaranteed to stay aligned\n",
+    "  even_rows = [x for x in feature_descriptors if x[\"_id\"] % 2 == 0]\n",
+    "  feature_vectors = np.array([x[feature_model] for x in even_rows], dtype = float)\n",
+    "  feature_labels = [x[\"label\"] for x in even_rows]\n",
+    "  feature_ids = [x[\"_id\"] for x in even_rows]\n",
+    "\n",
+    "  match dim_reduction:\n",
+    "    case 1:\n",
+    "      technique = 'svd'\n",
+    "      # Import the submodule explicitly: a bare `import scipy` may not expose scipy.sparse.linalg\n",
+    "      from scipy.sparse.linalg import svds\n",
+    "      U, S, Vh = svds(feature_vectors, k = k)\n",
+    "    case 2:\n",
+    "      technique = 'nnmf'\n",
+    "      model = NMF(n_components = k, init = 'random', solver = 'cd', alpha_H = 0.01, alpha_W = 0.01, max_iter = 10000)\n",
+    "      # NMF rejects negative entries, so shift everything by the global minimum first\n",
+    "      U = model.fit_transform(feature_vectors - np.min(feature_vectors))\n",
+    "    case 3:  # NOTE(review): this is Linear Discriminant Analysis; confirm Latent Dirichlet Allocation was not intended\n",
+    "      technique = 'lda'\n",
+    "      U = LinearDiscriminantAnalysis(n_components = k).fit_transform(feature_vectors, feature_labels)\n",
+    "    case 4:\n",
+    "      technique = 'kmeans'\n",
+    "      # Each image is described by its distance to every one of the k cluster centres\n",
+    "      U = KMeans(n_clusters = k).fit(feature_vectors).transform(feature_vectors)\n",
+    "    case _:\n",
+    "      raise ValueError(f\"dim_reduction must be 1 (SVD), 2 (NNMF), 3 (LDA) or 4 (k-means), got {dim_reduction!r}\")\n",
+    "\n",
+    "  # Rank images by their weight on the first latent semantic, largest first\n",
+    "  ranked = sorted(zip(feature_ids, U.tolist()), key = lambda x: x[1][0], reverse = True)\n",
+    "  k_latent_semantics = [{\"_id\": item[0], \"semantics\": item[1]} for item in ranked]\n",
+    "  with open(f'{feature_model}-{technique}-semantics.json', 'w', encoding='utf-8') as f:\n",
+    "    json.dump(k_latent_semantics, f, ensure_ascii = False)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def main():\n",
+    "  \"\"\"Prompt for k, a feature model and a reduction technique, then extract the latent semantics.\"\"\"\n",
+    "  k = int(input(\"Enter k: \"))\n",
+    "\n",
+    "  features = ['color_moments', 'hog', 'layer3', 'avgpool', 'fc']\n",
+    "\n",
+    "  # Menu order must mirror `features`: option 3 is layer3 and option 4 is avgpool\n",
+    "  print(\"\\n1: Color moments\")\n",
+    "  print(\"2: HOG\")\n",
+    "  print(\"3: Resnet50 Layer 3\")\n",
+    "  print(\"4: Resnet50 Avgpool layer\")\n",
+    "  print(\"5: Resnet50 FC layer\")\n",
+    "  choice = int(input(\"Select the feature model: \"))\n",
+    "  if not 1 <= choice <= len(features):  # 0 would silently wrap around to 'fc'\n",
+    "    raise ValueError(f\"feature model must be between 1 and {len(features)}\")\n",
+    "  feature_model = features[choice - 1]\n",
+    "\n",
+    "  print(\"\\n1. SVD\")\n",
+    "  print(\"2. NNMF\")\n",
+    "  print(\"3. LDA\")\n",
+    "  print(\"4. k-means\")\n",
+    "  dim_reduction = int(input(\"Select the dimensionality reduction technique: \"))\n",
+    "\n",
+    "  extractKLatentSemantics(k, feature_model, dim_reduction)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if __name__ == \"__main__\":\n",
+    "  main()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/2/task_3.ipynb
+++ b/2/task_3.ipynb
@@ -2,126 +2,82 @@
 "cells": [
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
-    "import json\n",
-    "from pymongo import MongoClient\n",
-    "from task0a import *\n",
-    "import scipy\n",
-    "import numpy as np\n",
-    "from sklearn.decomposition import NMF\n",
-    "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n",
-    "from sklearn.cluster import KMeans\n"
+    "from utils import *\n",
+    "warnings.filterwarnings('ignore')\n",
+    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
-    "client = MongoClient()\n",
-    "client = MongoClient(host=\"localhost\", port=27017)\n",
-    "\n",
-    "# Select the database\n",
-    "db = client.Multimedia_Web_DBs\n",
-    "\n",
-    "# Fetch all documents from the collection and then sort them by \"_id\"\n",
-    "feature_descriptors = list(db.Caltech101_Feature_Descriptors.find({}))\n",
-    "feature_descriptors = sorted(list(db.Caltech101_Feature_Descriptors.find({})), key=lambda x: x[\"_id\"], reverse=False)\n",
-    "\n",
-    "num_labels = 101"
+    "fd_collection = getCollection(\"team_5_mwdb_phase_2\", \"fd_collection\")\n"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Applying lda on the fc_fd space to get 10 latent semantics (showing only top 10 image-weight pairs for each latent semantic)...\n"
+     ]
+    },
+    {
+     "ename": "KeyboardInterrupt",
+     "evalue": "",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
+      "\u001b[1;32mc:\\Kaushik\\ASU\\CSE 515 - Multimedia and Web Databases\\Project\\Phase 2\\task_3.ipynb Cell 3\u001b[0m line \u001b[0;36m1\n\u001b[0;32m      <a href='vscode-notebook-cell:/c%3A/Kaushik/ASU/CSE%20515%20-%20Multimedia%20and%20Web%20Databases/Project/Phase%202/task_3.ipynb#W2sZmlsZQ%3D%3D?line=6'>7</a>\u001b[0m     \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\u001b[39m\"\u001b[39m\u001b[39mk should be a positive integer\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m      <a href='vscode-notebook-cell:/c%3A/Kaushik/ASU/CSE%20515%20-%20Multimedia%20and%20Web%20Databases/Project/Phase%202/task_3.ipynb#W2sZmlsZQ%3D%3D?line=8'>9</a>\u001b[0m selected_dim_reduction_method \u001b[39m=\u001b[39m \u001b[39mstr\u001b[39m(\n\u001b[0;32m     <a href='vscode-notebook-cell:/c%3A/Kaushik/ASU/CSE%20515%20-%20Multimedia%20and%20Web%20Databases/Project/Phase%202/task_3.ipynb#W2sZmlsZQ%3D%3D?line=9'>10</a>\u001b[0m     \u001b[39minput\u001b[39m(\n\u001b[0;32m     <a href='vscode-notebook-cell:/c%3A/Kaushik/ASU/CSE%20515%20-%20Multimedia%20and%20Web%20Databases/Project/Phase%202/task_3.ipynb#W2sZmlsZQ%3D%3D?line=10'>11</a>\u001b[0m         \u001b[39m\"\u001b[39m\u001b[39mEnter dimensionality reduction method - one of \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m     <a href='vscode-notebook-cell:/c%3A/Kaushik/ASU/CSE%20515%20-%20Multimedia%20and%20Web%20Databases/Project/Phase%202/task_3.ipynb#W2sZmlsZQ%3D%3D?line=11'>12</a>\u001b[0m         \u001b[39m+\u001b[39m \u001b[39mstr\u001b[39m(\u001b[39mlist\u001b[39m(valid_dim_reduction_methods\u001b[39m.\u001b[39mkeys()))\n\u001b[0;32m     <a href='vscode-notebook-cell:/c%3A/Kaushik/ASU/CSE%20515%20-%20Multimedia%20and%20Web%20Databases/Project/Phase%202/task_3.ipynb#W2sZmlsZQ%3D%3D?line=12'>13</a>\u001b[0m     )\n\u001b[0;32m     <a href='vscode-notebook-cell:/c%3A/Kaushik/ASU/CSE%20515%20-%20Multimedia%20and%20Web%20Databases/Project/Phase%202/task_3.ipynb#W2sZmlsZQ%3D%3D?line=13'>14</a>\u001b[0m 
)\n\u001b[1;32m---> <a href='vscode-notebook-cell:/c%3A/Kaushik/ASU/CSE%20515%20-%20Multimedia%20and%20Web%20Databases/Project/Phase%202/task_3.ipynb#W2sZmlsZQ%3D%3D?line=15'>16</a>\u001b[0m extract_latent_semantics(\n\u001b[0;32m     <a href='vscode-notebook-cell:/c%3A/Kaushik/ASU/CSE%20515%20-%20Multimedia%20and%20Web%20Databases/Project/Phase%202/task_3.ipynb#W2sZmlsZQ%3D%3D?line=16'>17</a>\u001b[0m     fd_collection,\n\u001b[0;32m     <a href='vscode-notebook-cell:/c%3A/Kaushik/ASU/CSE%20515%20-%20Multimedia%20and%20Web%20Databases/Project/Phase%202/task_3.ipynb#W2sZmlsZQ%3D%3D?line=17'>18</a>\u001b[0m     k,\n\u001b[0;32m     <a href='vscode-notebook-cell:/c%3A/Kaushik/ASU/CSE%20515%20-%20Multimedia%20and%20Web%20Databases/Project/Phase%202/task_3.ipynb#W2sZmlsZQ%3D%3D?line=18'>19</a>\u001b[0m     selected_feature_model,\n\u001b[0;32m     <a href='vscode-notebook-cell:/c%3A/Kaushik/ASU/CSE%20515%20-%20Multimedia%20and%20Web%20Databases/Project/Phase%202/task_3.ipynb#W2sZmlsZQ%3D%3D?line=19'>20</a>\u001b[0m     selected_dim_reduction_method,\n\u001b[0;32m     <a href='vscode-notebook-cell:/c%3A/Kaushik/ASU/CSE%20515%20-%20Multimedia%20and%20Web%20Databases/Project/Phase%202/task_3.ipynb#W2sZmlsZQ%3D%3D?line=20'>21</a>\u001b[0m     top_images\u001b[39m=\u001b[39;49m\u001b[39m10\u001b[39;49m,\n\u001b[0;32m     <a href='vscode-notebook-cell:/c%3A/Kaushik/ASU/CSE%20515%20-%20Multimedia%20and%20Web%20Databases/Project/Phase%202/task_3.ipynb#W2sZmlsZQ%3D%3D?line=21'>22</a>\u001b[0m )\n",
+      "File \u001b[1;32mc:\\Kaushik\\ASU\\CSE 515 - Multimedia and Web Databases\\Project\\Phase 2\\utils.py:674\u001b[0m, in \u001b[0;36mextract_latent_semantics\u001b[1;34m(fd_collection, k, feature_model, dim_reduction_method, top_images)\u001b[0m\n\u001b[0;32m    669\u001b[0m \u001b[39m# unsupervised LDA to extract topics (Latent Dirichlet Allocation)\u001b[39;00m\n\u001b[0;32m    670\u001b[0m \u001b[39m# Note: LDA takes a bit of time\u001b[39;00m\n\u001b[0;32m    671\u001b[0m \u001b[39mcase\u001b[39;00m \u001b[39m3\u001b[39m:\n\u001b[0;32m    672\u001b[0m     \u001b[39m# LDA requires non-negative input data\u001b[39;00m\n\u001b[0;32m    673\u001b[0m     \u001b[39m# so shift the input by subtracting the smallest value\u001b[39;00m\n\u001b[1;32m--> 674\u001b[0m     min_value \u001b[39m=\u001b[39m np\u001b[39m.\u001b[39mmin(feature_vectors)\n\u001b[0;32m    675\u001b[0m     feature_vectors_shifted \u001b[39m=\u001b[39m feature_vectors \u001b[39m-\u001b[39m min_value\n\u001b[0;32m    677\u001b[0m     model \u001b[39m=\u001b[39m LatentDirichletAllocation(n_components\u001b[39m=\u001b[39mk, learning_method\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39monline\u001b[39m\u001b[39m\"\u001b[39m, verbose\u001b[39m=\u001b[39m\u001b[39m4\u001b[39m)\n",
+      "File \u001b[1;32mc:\\Users\\rknar\\.pyenv\\pyenv-win\\versions\\3.10.5\\lib\\site-packages\\sklearn\\base.py:1151\u001b[0m, in \u001b[0;36m_fit_context.<locals>.decorator.<locals>.wrapper\u001b[1;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[0;32m   1144\u001b[0m     estimator\u001b[39m.\u001b[39m_validate_params()\n\u001b[0;32m   1146\u001b[0m \u001b[39mwith\u001b[39;00m config_context(\n\u001b[0;32m   1147\u001b[0m     skip_parameter_validation\u001b[39m=\u001b[39m(\n\u001b[0;32m   1148\u001b[0m         prefer_skip_nested_validation \u001b[39mor\u001b[39;00m global_skip_validation\n\u001b[0;32m   1149\u001b[0m     )\n\u001b[0;32m   1150\u001b[0m ):\n\u001b[1;32m-> 1151\u001b[0m     \u001b[39mreturn\u001b[39;00m fit_method(estimator, \u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n",
+      "File \u001b[1;32mc:\\Users\\rknar\\.pyenv\\pyenv-win\\versions\\3.10.5\\lib\\site-packages\\sklearn\\decomposition\\_lda.py:665\u001b[0m, in \u001b[0;36mLatentDirichletAllocation.fit\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m    663\u001b[0m \u001b[39mif\u001b[39;00m learning_method \u001b[39m==\u001b[39m \u001b[39m\"\u001b[39m\u001b[39monline\u001b[39m\u001b[39m\"\u001b[39m:\n\u001b[0;32m    664\u001b[0m     \u001b[39mfor\u001b[39;00m idx_slice \u001b[39min\u001b[39;00m gen_batches(n_samples, batch_size):\n\u001b[1;32m--> 665\u001b[0m         \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_em_step(\n\u001b[0;32m    666\u001b[0m             X[idx_slice, :],\n\u001b[0;32m    667\u001b[0m             total_samples\u001b[39m=\u001b[39;49mn_samples,\n\u001b[0;32m    668\u001b[0m             batch_update\u001b[39m=\u001b[39;49m\u001b[39mFalse\u001b[39;49;00m,\n\u001b[0;32m    669\u001b[0m             parallel\u001b[39m=\u001b[39;49mparallel,\n\u001b[0;32m    670\u001b[0m         )\n\u001b[0;32m    671\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m    672\u001b[0m     \u001b[39m# batch update\u001b[39;00m\n\u001b[0;32m    673\u001b[0m     \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_em_step(\n\u001b[0;32m    674\u001b[0m         X, total_samples\u001b[39m=\u001b[39mn_samples, batch_update\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m, parallel\u001b[39m=\u001b[39mparallel\n\u001b[0;32m    675\u001b[0m     )\n",
+      "File \u001b[1;32mc:\\Users\\rknar\\.pyenv\\pyenv-win\\versions\\3.10.5\\lib\\site-packages\\sklearn\\decomposition\\_lda.py:524\u001b[0m, in \u001b[0;36mLatentDirichletAllocation._em_step\u001b[1;34m(self, X, total_samples, batch_update, parallel)\u001b[0m\n\u001b[0;32m    497\u001b[0m \u001b[39m\u001b[39m\u001b[39m\"\"\"EM update for 1 iteration.\u001b[39;00m\n\u001b[0;32m    498\u001b[0m \n\u001b[0;32m    499\u001b[0m \u001b[39mupdate `_component` by batch VB or online VB.\u001b[39;00m\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m    520\u001b[0m \u001b[39m    Unnormalized document topic distribution.\u001b[39;00m\n\u001b[0;32m    521\u001b[0m \u001b[39m\"\"\"\u001b[39;00m\n\u001b[0;32m    523\u001b[0m \u001b[39m# E-step\u001b[39;00m\n\u001b[1;32m--> 524\u001b[0m _, suff_stats \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_e_step(\n\u001b[0;32m    525\u001b[0m     X, cal_sstats\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m, random_init\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m, parallel\u001b[39m=\u001b[39;49mparallel\n\u001b[0;32m    526\u001b[0m )\n\u001b[0;32m    528\u001b[0m \u001b[39m# M-step\u001b[39;00m\n\u001b[0;32m    529\u001b[0m \u001b[39mif\u001b[39;00m batch_update:\n",
+      "File \u001b[1;32mc:\\Users\\rknar\\.pyenv\\pyenv-win\\versions\\3.10.5\\lib\\site-packages\\sklearn\\decomposition\\_lda.py:467\u001b[0m, in \u001b[0;36mLatentDirichletAllocation._e_step\u001b[1;34m(self, X, cal_sstats, random_init, parallel)\u001b[0m\n\u001b[0;32m    465\u001b[0m \u001b[39mif\u001b[39;00m parallel \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m    466\u001b[0m     parallel \u001b[39m=\u001b[39m Parallel(n_jobs\u001b[39m=\u001b[39mn_jobs, verbose\u001b[39m=\u001b[39m\u001b[39mmax\u001b[39m(\u001b[39m0\u001b[39m, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mverbose \u001b[39m-\u001b[39m \u001b[39m1\u001b[39m))\n\u001b[1;32m--> 467\u001b[0m results \u001b[39m=\u001b[39m parallel(\n\u001b[0;32m    468\u001b[0m     delayed(_update_doc_distribution)(\n\u001b[0;32m    469\u001b[0m         X[idx_slice, :],\n\u001b[0;32m    470\u001b[0m         \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mexp_dirichlet_component_,\n\u001b[0;32m    471\u001b[0m         \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mdoc_topic_prior_,\n\u001b[0;32m    472\u001b[0m         \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mmax_doc_update_iter,\n\u001b[0;32m    473\u001b[0m         \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mmean_change_tol,\n\u001b[0;32m    474\u001b[0m         cal_sstats,\n\u001b[0;32m    475\u001b[0m         random_state,\n\u001b[0;32m    476\u001b[0m     )\n\u001b[0;32m    477\u001b[0m     \u001b[39mfor\u001b[39;49;00m idx_slice \u001b[39min\u001b[39;49;00m gen_even_slices(X\u001b[39m.\u001b[39;49mshape[\u001b[39m0\u001b[39;49m], n_jobs)\n\u001b[0;32m    478\u001b[0m )\n\u001b[0;32m    480\u001b[0m \u001b[39m# merge result\u001b[39;00m\n\u001b[0;32m    481\u001b[0m doc_topics, sstats_list \u001b[39m=\u001b[39m \u001b[39mzip\u001b[39m(\u001b[39m*\u001b[39mresults)\n",
+      "File \u001b[1;32mc:\\Users\\rknar\\.pyenv\\pyenv-win\\versions\\3.10.5\\lib\\site-packages\\sklearn\\utils\\parallel.py:65\u001b[0m, in \u001b[0;36mParallel.__call__\u001b[1;34m(self, iterable)\u001b[0m\n\u001b[0;32m     60\u001b[0m config \u001b[39m=\u001b[39m get_config()\n\u001b[0;32m     61\u001b[0m iterable_with_config \u001b[39m=\u001b[39m (\n\u001b[0;32m     62\u001b[0m     (_with_config(delayed_func, config), args, kwargs)\n\u001b[0;32m     63\u001b[0m     \u001b[39mfor\u001b[39;00m delayed_func, args, kwargs \u001b[39min\u001b[39;00m iterable\n\u001b[0;32m     64\u001b[0m )\n\u001b[1;32m---> 65\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39msuper\u001b[39;49m()\u001b[39m.\u001b[39;49m\u001b[39m__call__\u001b[39;49m(iterable_with_config)\n",
+      "File \u001b[1;32mc:\\Users\\rknar\\.pyenv\\pyenv-win\\versions\\3.10.5\\lib\\site-packages\\joblib\\parallel.py:1863\u001b[0m, in \u001b[0;36mParallel.__call__\u001b[1;34m(self, iterable)\u001b[0m\n\u001b[0;32m   1861\u001b[0m     output \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_get_sequential_output(iterable)\n\u001b[0;32m   1862\u001b[0m     \u001b[39mnext\u001b[39m(output)\n\u001b[1;32m-> 1863\u001b[0m     \u001b[39mreturn\u001b[39;00m output \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mreturn_generator \u001b[39melse\u001b[39;00m \u001b[39mlist\u001b[39;49m(output)\n\u001b[0;32m   1865\u001b[0m \u001b[39m# Let's create an ID that uniquely identifies the current call. If the\u001b[39;00m\n\u001b[0;32m   1866\u001b[0m \u001b[39m# call is interrupted early and that the same instance is immediately\u001b[39;00m\n\u001b[0;32m   1867\u001b[0m \u001b[39m# re-used, this id will be used to prevent workers that were\u001b[39;00m\n\u001b[0;32m   1868\u001b[0m \u001b[39m# concurrently finalizing a task from the previous call to run the\u001b[39;00m\n\u001b[0;32m   1869\u001b[0m \u001b[39m# callback.\u001b[39;00m\n\u001b[0;32m   1870\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_lock:\n",
+      "File \u001b[1;32mc:\\Users\\rknar\\.pyenv\\pyenv-win\\versions\\3.10.5\\lib\\site-packages\\joblib\\parallel.py:1792\u001b[0m, in \u001b[0;36mParallel._get_sequential_output\u001b[1;34m(self, iterable)\u001b[0m\n\u001b[0;32m   1790\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mn_dispatched_batches \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m \u001b[39m1\u001b[39m\n\u001b[0;32m   1791\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mn_dispatched_tasks \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m \u001b[39m1\u001b[39m\n\u001b[1;32m-> 1792\u001b[0m res \u001b[39m=\u001b[39m func(\u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n\u001b[0;32m   1793\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mn_completed_tasks \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m \u001b[39m1\u001b[39m\n\u001b[0;32m   1794\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mprint_progress()\n",
+      "File \u001b[1;32mc:\\Users\\rknar\\.pyenv\\pyenv-win\\versions\\3.10.5\\lib\\site-packages\\sklearn\\utils\\parallel.py:127\u001b[0m, in \u001b[0;36m_FuncWrapper.__call__\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m    125\u001b[0m     config \u001b[39m=\u001b[39m {}\n\u001b[0;32m    126\u001b[0m \u001b[39mwith\u001b[39;00m config_context(\u001b[39m*\u001b[39m\u001b[39m*\u001b[39mconfig):\n\u001b[1;32m--> 127\u001b[0m     \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mfunction(\u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n",
+      "File \u001b[1;32mc:\\Users\\rknar\\.pyenv\\pyenv-win\\versions\\3.10.5\\lib\\site-packages\\sklearn\\decomposition\\_lda.py:144\u001b[0m, in \u001b[0;36m_update_doc_distribution\u001b[1;34m(X, exp_topic_word_distr, doc_topic_prior, max_doc_update_iter, mean_change_tol, cal_sstats, random_state)\u001b[0m\n\u001b[0;32m    140\u001b[0m last_d \u001b[39m=\u001b[39m doc_topic_d\n\u001b[0;32m    142\u001b[0m \u001b[39m# The optimal phi_{dwk} is proportional to\u001b[39;00m\n\u001b[0;32m    143\u001b[0m \u001b[39m# exp(E[log(theta_{dk})]) * exp(E[log(beta_{dw})]).\u001b[39;00m\n\u001b[1;32m--> 144\u001b[0m norm_phi \u001b[39m=\u001b[39m np\u001b[39m.\u001b[39;49mdot(exp_doc_topic_d, exp_topic_word_d) \u001b[39m+\u001b[39m eps\n\u001b[0;32m    146\u001b[0m doc_topic_d \u001b[39m=\u001b[39m exp_doc_topic_d \u001b[39m*\u001b[39m np\u001b[39m.\u001b[39mdot(cnts \u001b[39m/\u001b[39m norm_phi, exp_topic_word_d\u001b[39m.\u001b[39mT)\n\u001b[0;32m    147\u001b[0m \u001b[39m# Note: adds doc_topic_prior to doc_topic_d, in-place.\u001b[39;00m\n",
+      "File \u001b[1;32m<__array_function__ internals>:180\u001b[0m, in \u001b[0;36mdot\u001b[1;34m(*args, **kwargs)\u001b[0m\n",
+      "\u001b[1;31mKeyboardInterrupt\u001b[0m: "
+     ]
+    }
+   ],
   "source": [
-    "def extractKLatentSemantics(k, feature_model, dim_reduction):\n",
+    "selected_feature_model = valid_feature_models[\n",
+    "    str(input(\"Enter feature model - one of \" + str(list(valid_feature_models.keys()))))\n",
+    "]\n",
    "\n",
-    "  feature_vectors = [x[feature_model] for x in feature_descriptors if x[\"_id\"] % 2 == 0]\n",
-    "  feature_labels = [x[\"label\"] for x in feature_descriptors if x[\"_id\"] % 2 == 0]\n",
-    "  feature_ids = [x[\"_id\"] for x in feature_descriptors if x[\"_id\"] % 2 == 0]\n",
+    "k = int(input(\"Enter value of k: \"))\n",
+    "if k < 1:\n",
+    "    raise ValueError(\"k should be a positive integer\")\n",
    "\n",
-    "  filename = ''\n",
+    "selected_dim_reduction_method = str(\n",
+    "    input(\n",
+    "        \"Enter dimensionality reduction method - one of \"\n",
+    "        + str(list(valid_dim_reduction_methods.keys()))\n",
+    "    )\n",
+    ")\n",
    "\n",
-    "\n",
-    "  match dim_reduction:\n",
-    "\n",
-    "    case 1:\n",
-    "      filename = f'{feature_model}-svd-semantics.json'\n",
-    "      U, S, Vh = scipy.sparse.linalg.svds(np.array(feature_vectors), k=k)\n",
-    "      k_latent_semantics = sorted(list(zip(feature_ids, U.tolist())), key = lambda x: x[1][0], reverse = True)\n",
-    "\n",
-    "    case 2:\n",
-    "      filename = f'{feature_model}-nnmf-semantics.json'\n",
-    "      model = NMF(n_components = k, init = 'random', solver = 'cd', alpha_H = 0.01, alpha_W = 0.01, max_iter = 10000)\n",
-    "      min_value = np.min(feature_vectors)\n",
-    "      feature_vectors_shifted = feature_vectors - min_value\n",
-    "      U = model.fit_transform(np.array(feature_vectors_shifted))\n",
-    "      k_latent_semantics = sorted(list(zip(feature_ids, U.tolist())), key = lambda x: x[1][0], reverse = True)\n",
-    "\n",
-    "    case 3:\n",
-    "      filename = f'{feature_model}-lda-semantics.json'\n",
-    "      U = LinearDiscriminantAnalysis(n_components = k).fit_transform(feature_vectors, feature_labels)\n",
-    "      k_latent_semantics = sorted(list(zip(feature_ids, U.tolist())), key = lambda x: x[1][0], reverse = True)\n",
-    "\n",
-    "    case 4:\n",
-    "      filename = f'{feature_model}-kmeans-semantics.json'\n",
-    "      kmeans = KMeans(n_clusters = k)\n",
-    "      kmeans.fit(feature_vectors)\n",
-    "      U = kmeans.transform(feature_vectors)\n",
-    "      k_latent_semantics = sorted(list(zip(feature_ids, U.tolist())), key = lambda x: x[1][0], reverse = True)\n",
-    "  \n",
-    "  k_latent_semantics = [{\"_id\": item[0], \"semantics\": item[1]} for item in k_latent_semantics]\n",
-    "  with open(filename, 'w', encoding='utf-8') as f:\n",
-    "    json.dump(k_latent_semantics, f, ensure_ascii = False)\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def main():\n",
-    "\n",
-    "  # Load dataset\n",
-    "\n",
-    "  # User input for Image ID\n",
-    "  k = int(input(\"Enter k: \"))\n",
-    "\n",
-    "  features = ['color_moments', 'hog', 'layer3', 'avgpool', 'fc']\n",
-    "\n",
-    "  # User input for feature model to extract\n",
-    "  print(\"\\n1: Color moments\")\n",
-    "  print(\"2: HOG\")\n",
-    "  print(\"3: Resnet50 Avgpool layer\")\n",
-    "  print(\"4: Resnet50 Layer 3\")\n",
-    "  print(\"5: Resnet50 FC layer\")\n",
-    "  feature_model = features[int(input(\"Select the feature model: \")) - 1]\n",
-    "\n",
-    "  print(\"\\n1. SVD\")\n",
-    "  print(\"2. NNMF\")\n",
-    "  print(\"3. LDA\")\n",
-    "  print(\"4. k-means\")\n",
-    "  dim_reduction = int(input(\"Select the dimensionality reduction technique: \"))\n",
-    "\n",
-    "  extractKLatentSemantics(k, feature_model, dim_reduction)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "if __name__ == \"__main__\":\n",
-    "   main()"
+    "extract_latent_semantics(\n",
+    "    fd_collection,\n",
+    "    k,\n",
+    "    selected_feature_model,\n",
+    "    selected_dim_reduction_method,\n",
+    "    top_images=10,\n",
+    ")\n"
   ]
  },
  {
@@ -133,8 +89,22 @@
  }
 ],
 "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
  "language_info": {
-   "name": "python"
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.5"
  }
 },
 "nbformat": 4,
--- a/2/utils.py
+++ b/2/utils.py
@@ -4,6 +4,11 @@ import math
 import cv2
 import numpy as np
 from scipy.stats import pearsonr
+from scipy.sparse.linalg import svds
+from sklearn.decomposition import NMF
+from sklearn.decomposition import LatentDirichletAllocation
+from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
+from sklearn.cluster import KMeans

 # Torch
 import torch
@@ -12,6 +17,7 @@ from torchvision.datasets import Caltech101
 from torchvision.models import resnet50, ResNet50_Weights

 # OS and env
+import json
 from os import getenv
 from dotenv import load_dotenv
 import warnings
@@ -566,3 +572,151 @@ def show_similar_images_for_label(
            f"Plots/Label_{target_label}_{feature_model}_{distance_measure.__name__}_k{k}.png"
        )
    plt.show()
+
+
+valid_dim_reduction_methods = {
+    "svd": 1,  # singular value decomposition
+    "nmf": 2,  # non-negative matrix factorization
+    "lda": 3,  # latent Dirichlet allocation
+    "kmeans": 4,  # k-means clustering
+}
+
+
+def extract_latent_semantics(
+    fd_collection, k, feature_model, dim_reduction_method, top_images=None
+):
+    """
+    Extract k latent semantics for the entire collection at once for a given feature_model
+    and dim_reduction_method, display the imageID-weight pairs of each latent semantic and
+    store the resulting factor matrices in a JSON file.
+
+    Parameters:
+        fd_collection: collection whose `find()` yields one document per image; each
+            document holds an "image_id" and a feature vector under `feature_model`
+        k: number of latent semantics (components / clusters) to extract
+        feature_model: one of the values of `valid_feature_models`
+        dim_reduction_method: one of the keys of `valid_dim_reduction_methods`
+        top_images: number of imageID-weight pairs shown per latent semantic;
+            leave blank to display all imageID-weight pairs
+
+    Returns:
+        None. The factor matrices are written as nested lists to
+        `{feature_model}-{dim_reduction_method}-{k}-semantics.json` in the current
+        working directory, under the keys "image-semantic" and "semantic-feature"
+        (plus "semantics-core" for svd).
+
+    Raises:
+        AssertionError: if feature_model or dim_reduction_method is not a valid choice
+    """
+
+    assert (
+        feature_model in valid_feature_models.values()
+    ), "feature_model should be one of " + str(list(valid_feature_models.keys()))
+    assert (
+        dim_reduction_method in valid_dim_reduction_methods.keys()
+    ), "dim_reduction_method should be one of " + str(
+        list(valid_dim_reduction_methods.keys())
+    )
+
+    # load every document of the collection into memory
+    all_images = list(fd_collection.find())
+    feature_vectors = np.array([img[feature_model] for img in all_images])
+    feature_ids = [img["image_id"] for img in all_images]
+
+    top_img_str = ""
+    if top_images is not None:
+        top_img_str = f" (showing only top {top_images} image-weight pairs for each latent semantic)"
+    print(
+        "Applying {} on the {} space to get {} latent semantics{}...".format(
+            dim_reduction_method, feature_model, k, top_img_str
+        )
+    )
+
+    # Every case below sets `image_semantic` (one row per image, one column per latent
+    # semantic) and `semantic_feature` (one row per latent semantic); extra factors, if
+    # any, are added to `all_latent_semantics` directly
+    all_latent_semantics = {}
+
+    match valid_dim_reduction_methods[dim_reduction_method]:
+        # singular value decomposition
+        # sparse version of SVD to get only k singular values
+        case 1:
+            U, S, V_T = svds(feature_vectors, k=k)
+            image_semantic, semantic_feature = U, V_T
+            # SVD is the only method that also yields a core (the k singular values)
+            all_latent_semantics["semantics-core"] = S.tolist()
+
+        # non-negative matrix factorization
+        case 2:
+            # NNMF requires non-negative input data
+            # so shift the input by subtracting the smallest value
+            min_value = np.min(feature_vectors)
+            feature_vectors_shifted = feature_vectors - min_value
+
+            model = NMF(
+                n_components=k,
+                init="random",
+                solver="cd",
+                alpha_H=0.01,
+                alpha_W=0.01,
+                max_iter=10000,
+            )
+            # fit on the shifted data, then project the same data to get the image weights
+            model.fit(feature_vectors_shifted)
+
+            image_semantic = model.transform(feature_vectors_shifted)
+            semantic_feature = model.components_
+
+        # unsupervised LDA to extract topics (Latent Dirichlet Allocation)
+        # Note: LDA takes a bit of time
+        case 3:
+            # LDA requires non-negative input data
+            # so shift the input by subtracting the smallest value
+            min_value = np.min(feature_vectors)
+            feature_vectors_shifted = feature_vectors - min_value
+
+            model = LatentDirichletAllocation(
+                n_components=k, learning_method="online", verbose=4
+            )
+            model.fit(feature_vectors_shifted)
+
+            # (k x fd_dim) factor matrix for latent semantic-feature pairs
+            semantic_feature = model.components_
+            # (n_images x k) factor matrix for image ID-latent semantic pairs
+            image_semantic = model.transform(feature_vectors_shifted)
+
+        # k-means clustering to reduce to k clusters/dimensions
+        case 4:
+            model = KMeans(n_clusters=k).fit(feature_vectors)
+            semantic_feature = model.cluster_centers_
+            # KMeans.transform gives the distance of each image to every cluster center,
+            # so a larger "weight" means the image is farther from that cluster
+            image_semantic = model.transform(feature_vectors)
+
+    # store plain nested lists: json.dump cannot serialize numpy arrays
+    all_latent_semantics["image-semantic"] = image_semantic.tolist()
+    all_latent_semantics["semantic-feature"] = semantic_feature.tolist()
+
+    # for each latent semantic, sort imageID-weight pairs by weights in descending order
+    displayed_latent_semantics = [
+        sorted(
+            zip(feature_ids, latent_semantic),
+            key=lambda x: x[1],
+            reverse=True,
+        )[:top_images]
+        for latent_semantic in image_semantic.T
+    ]
+
+    # display the imageID-weight pairs of every latent semantic
+    for idx, latent_semantic in enumerate(displayed_latent_semantics):
+        print(f"Latent semantic no. {idx}")
+        for image_id, weight in latent_semantic:
+            print(f"Image_ID\t{image_id}\t-\tWeight\t{weight}")
+
+    # persist the factor matrices of this run
+    with open(
+        f"{feature_model}-{dim_reduction_method}-{k}-semantics.json",
+        "w",
+        encoding="utf-8",
+    ) as output_file:
+        json.dump(all_latent_semantics, output_file, ensure_ascii=False)