CSE515_MWDB_Project/Phase 2/task_10.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The autoreload extension is already loaded. To reload it, use:\n",
      "  %reload_ext autoreload\n"
     ]
    }
   ],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "from utils import *\n",
    "warnings.filterwarnings('ignore')\n",
    "%matplotlib inline\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "fd_collection = getCollection(\"team_5_mwdb_phase_2\", \"fd_collection\")\n",
    "# Materialise the query result once. Later cells index it as all_images[i]\n",
    "# inside loops; on a lazy PyMongo cursor every such lookup re-runs the query\n",
    "# with a skip, which is one database round trip per access.\n",
    "# NOTE(review): only \"true_label\" is read from these documents in this\n",
    "# notebook, so a projection could cut memory use - confirm before narrowing.\n",
    "all_images = list(fd_collection.find())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect and validate the query parameters for this task.\n",
    "selected_latent_space = valid_latent_spaces[\n",
    "    str(input(\"Enter latent space - one of \" + str(list(valid_latent_spaces.keys()))))\n",
    "]\n",
    "\n",
    "selected_feature_model = valid_feature_models[\n",
    "    str(input(\"Enter feature model - one of \" + str(list(valid_feature_models.keys()))))\n",
    "]\n",
    "\n",
    "k = int(input(\"Enter value of k (no. of latent semantics): \"))\n",
    "if k < 1:\n",
    "    raise ValueError(\"k should be a positive integer\")\n",
    "\n",
    "k_2 = int(input(\"Enter value of k_2 (no. of similar images): \"))\n",
    "if k_2 < 1:\n",
    "    raise ValueError(\"k_2 should be a positive integer\")\n",
    "\n",
    "if selected_latent_space != \"cp\":\n",
    "    selected_dim_reduction_method = str(\n",
    "        input(\n",
    "            \"Enter dimensionality reduction method - one of \"\n",
    "            + str(list(valid_dim_reduction_methods.keys()))\n",
    "        )\n",
    "    )\n",
    "    # Fail here rather than later with a misleading \"file does not exist\".\n",
    "    if selected_dim_reduction_method not in valid_dim_reduction_methods:\n",
    "        raise ValueError(\n",
    "            \"dimensionality reduction method should be one of \"\n",
    "            + str(list(valid_dim_reduction_methods.keys()))\n",
    "        )\n",
    "else:\n",
    "    # The \"cp\" latent space takes no reduction method; reset the name so a\n",
    "    # value left over from an earlier run of this cell cannot leak through.\n",
    "    selected_dim_reduction_method = None\n",
    "\n",
    "label = int(input(\"Enter label: \"))\n",
    "# Must be `or`: a number is never both below 0 and above 100, so the\n",
    "# previous `and` check could not reject anything.\n",
    "if label < 0 or label > 100:\n",
    "    raise ValueError(\"label should be between 0 and 100\")\n",
    "\n",
    "label_rep = calculate_label_representatives(\n",
    "    fd_collection, label, selected_feature_model\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "cm_fd-svd-10-semantics.json loaded\n"
     ]
    }
   ],
   "source": [
    "def _load_semantics(file_name):\n",
    "    \"\"\"Load a latent-semantics JSON file and report it on stdout.\n",
    "\n",
    "    Args:\n",
    "        file_name: path of the JSON file to read.\n",
    "\n",
    "    Returns:\n",
    "        The decoded JSON content.\n",
    "\n",
    "    Raises:\n",
    "        Exception: if file_name does not exist.\n",
    "    \"\"\"\n",
    "    if not os.path.exists(file_name):\n",
    "        raise Exception(file_name + \" does not exist\")\n",
    "    # Context manager so the handle is closed even if decoding fails.\n",
    "    with open(file_name) as semantics_file:\n",
    "        semantics = json.load(semantics_file)\n",
    "    print(file_name + \" loaded\")\n",
    "    return semantics\n",
    "\n",
    "\n",
    "# Loading latent semantics\n",
    "match selected_latent_space:\n",
    "    # LS1\n",
    "    case \"\":\n",
    "        file_prefix = f\"{selected_feature_model}-{selected_dim_reduction_method}-{k}\"\n",
    "        file_name = file_prefix + \"-semantics.json\"\n",
    "        model_name = file_prefix + \"-model.joblib\"\n",
    "        data = _load_semantics(file_name)\n",
    "        # LDA additionally needs its fitted model to transform query vectors\n",
    "        if selected_dim_reduction_method == \"lda\":\n",
    "            if os.path.exists(model_name):\n",
    "                data_model = load(model_name)\n",
    "                print(model_name + \" loaded\")\n",
    "            else:\n",
    "                raise Exception(model_name + \" does not exist\")\n",
    "    # LS2\n",
    "    case \"cp\":\n",
    "        file_name = f\"{selected_feature_model}-cp-{k}-semantics.json\"\n",
    "        data = _load_semantics(file_name)\n",
    "    # LS3, LS4\n",
    "    case _:\n",
    "        file_name = f\"{selected_latent_space}-{selected_feature_model}-{selected_dim_reduction_method}-{k}-semantics.json\"\n",
    "        data = _load_semantics(file_name)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _mean_label_vector(rows, label):\n",
    "    \"\"\"Return the column-wise mean of the rows whose image carries `label`.\n",
    "\n",
    "    Row i of `rows` is matched against all_images[i][\"true_label\"], so both\n",
    "    are assumed to be in the same order.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if no row belongs to `label`.\n",
    "    \"\"\"\n",
    "    label_vectors = [\n",
    "        rows[i] for i in range(len(rows)) if all_images[i][\"true_label\"] == label\n",
    "    ]\n",
    "    if not label_vectors:\n",
    "        # An empty mean would only surface later as a dimension mismatch.\n",
    "        raise ValueError(f\"no images found for label {label}\")\n",
    "    return [sum(col) / len(col) for col in zip(*label_vectors)]\n",
    "\n",
    "\n",
    "def extract_similarities_ls1_ls4(latent_space, dim_reduction, data, label, label_rep):\n",
    "    \"\"\"Print the k_2 images closest to a label in an LS1 or LS4 latent space.\n",
    "\n",
    "    Reads the notebook globals all_images, NUM_IMAGES and k_2, plus data_model\n",
    "    when dim_reduction is \"lda\" and latent_space is not \"image_sim\".\n",
    "\n",
    "    Args:\n",
    "        latent_space: \"image_sim\" rebuilds the label representative from rows\n",
    "            of the stored matrices; any other value projects `label_rep`.\n",
    "        dim_reduction: one of \"svd\", \"nmf\", \"kmeans\" or \"lda\".\n",
    "        data: loaded latent-semantics dictionary.\n",
    "        label: query label.\n",
    "        label_rep: representative vector of the label; recomputed (and the\n",
    "            argument ignored) when latent_space is \"image_sim\".\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if dim_reduction is not supported, or no image carries\n",
    "            `label` in the \"image_sim\" case.\n",
    "    \"\"\"\n",
    "    from_image_sim = latent_space == \"image_sim\"\n",
    "\n",
    "    match dim_reduction:\n",
    "\n",
    "        case \"svd\":\n",
    "            U = np.array(data[\"image-semantic\"])\n",
    "            S = np.array(data[\"semantics-core\"])\n",
    "            # The core may be stored as a vector of singular values.\n",
    "            if len(S.shape) == 1:\n",
    "                S = np.diag(S)\n",
    "\n",
    "            if from_image_sim:\n",
    "                label_rep = _mean_label_vector(U, label)\n",
    "                comparison_vector = np.matmul(label_rep, S)\n",
    "            else:\n",
    "                V = np.transpose(np.array(data[\"semantic-feature\"]))\n",
    "                comparison_vector = np.matmul(np.matmul(label_rep, V), S)\n",
    "\n",
    "            comparison_feature_space = np.matmul(U, S)\n",
    "\n",
    "        case \"nmf\":\n",
    "            comparison_feature_space = np.array(data[\"image-semantic\"])\n",
    "            if from_image_sim:\n",
    "                comparison_vector = _mean_label_vector(comparison_feature_space, label)\n",
    "            else:\n",
    "                H = np.array(data[\"semantic-feature\"])\n",
    "                # Shift so the smallest entry becomes 0 before the NMF call.\n",
    "                feature_vectors_shifted = label_rep - np.min(label_rep)\n",
    "                comparison_vector = nmf(feature_vectors_shifted, H, update_H=False)\n",
    "\n",
    "        case \"kmeans\":\n",
    "            comparison_feature_space = np.array(data[\"image-semantic\"])\n",
    "            centroids = np.array(data[\"semantic-feature\"])\n",
    "\n",
    "            if from_image_sim:\n",
    "                label_rep = _mean_label_vector(np.array(data[\"sim-matrix\"]), label)\n",
    "\n",
    "            # The k-means semantic of a vector is its distance to each centroid.\n",
    "            comparison_vector = [\n",
    "                math.dist(label_rep, centroid) for centroid in centroids\n",
    "            ]\n",
    "\n",
    "        case \"lda\":\n",
    "            comparison_feature_space = np.array(data[\"image-semantic\"])\n",
    "            if from_image_sim:\n",
    "                comparison_vector = _mean_label_vector(comparison_feature_space, label)\n",
    "            else:\n",
    "                feature_vectors_shifted = label_rep - np.min(label_rep)\n",
    "                comparison_vector = data_model.transform(\n",
    "                    feature_vectors_shifted.flatten().reshape(1, -1)\n",
    "                ).flatten()\n",
    "\n",
    "        case _:\n",
    "            # Without this, an unknown method failed later on an unbound local.\n",
    "            raise ValueError(\n",
    "                f\"unsupported dimensionality reduction method: {dim_reduction!r}\"\n",
    "            )\n",
    "\n",
    "    # NOTE(review): \"image_id\" below is the row index into the latent space,\n",
    "    # not a value read from the database document - confirm that is intended.\n",
    "    distances = [\n",
    "        {\n",
    "            \"image_id\": i,\n",
    "            \"label\": all_images[i][\"true_label\"],\n",
    "            \"distance\": math.dist(comparison_vector, comparison_feature_space[i]),\n",
    "        }\n",
    "        for i in range(NUM_IMAGES)\n",
    "    ]\n",
    "\n",
    "    distances = sorted(distances, key=lambda x: x[\"distance\"], reverse=False)[:k_2]\n",
    "\n",
    "    for x in distances:\n",
    "        print(x)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_similarities_ls2(data, label):\n",
    "    \"\"\"Print the k_2 images closest to the label representative in the CP space.\n",
    "\n",
    "    The query vector is built from the notebook-level `label_rep`; the `label`\n",
    "    argument is accepted but not read. NUM_IMAGES and k_2 are also taken from\n",
    "    the notebook namespace.\n",
    "\n",
    "    Args:\n",
    "        data: loaded latent-semantics dictionary providing the keys\n",
    "            \"feature-semantic\", \"image-semantic\" and \"semantics-core\".\n",
    "        label: query label (unused here).\n",
    "    \"\"\"\n",
    "    feature_factor = np.array(data[\"feature-semantic\"])\n",
    "    image_factor = np.array(data[\"image-semantic\"])\n",
    "    core = np.array(data[\"semantics-core\"])\n",
    "\n",
    "    # A core stored as a plain vector is expanded into a diagonal matrix.\n",
    "    if core.ndim == 1:\n",
    "        core = np.diag(core)\n",
    "\n",
    "    # Project the label representative and every image into the weighted space.\n",
    "    query_vector = np.matmul(np.matmul(label_rep, feature_factor), core)\n",
    "    weighted_images = np.matmul(image_factor, core)\n",
    "\n",
    "    ranking = [\n",
    "        {\"image\": idx, \"distance\": math.dist(query_vector, weighted_images[idx])}\n",
    "        for idx in range(NUM_IMAGES)\n",
    "    ]\n",
    "    ranking.sort(key=lambda entry: entry[\"distance\"])\n",
    "\n",
    "    for entry in ranking[:k_2]:\n",
    "        print(entry)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_similarities_ls3(dim_reduction, data, label):\n",
    "    \"\"\"Print the label closest to `label` in the label-label latent space,\n",
    "    followed by up to k_2 randomly chosen images of `label`.\n",
    "\n",
    "    Reads the notebook globals NUM_LABELS, dataset and k_2.\n",
    "\n",
    "    Args:\n",
    "        dim_reduction: \"svd\" weights the image-semantic matrix by the core;\n",
    "            any other value uses the image-semantic matrix as stored.\n",
    "        data: loaded latent-semantics dictionary.\n",
    "        label: query label, used as a row index into the latent space.\n",
    "    \"\"\"\n",
    "    if dim_reduction == \"svd\":\n",
    "        U = np.array(data[\"image-semantic\"])\n",
    "        S = np.array(data[\"semantics-core\"])\n",
    "        # Same handling as the LS1/LS2/LS4 functions: a core stored as a\n",
    "        # vector must become a diagonal matrix, otherwise U @ S collapses to\n",
    "        # one number per label.\n",
    "        if len(S.shape) == 1:\n",
    "            S = np.diag(S)\n",
    "        comparison_feature_space = np.matmul(U, S)\n",
    "    else:\n",
    "        comparison_feature_space = np.array(data[\"image-semantic\"])\n",
    "\n",
    "    comparison_vector = comparison_feature_space[label]\n",
    "\n",
    "    distances = [\n",
    "        {\n",
    "            \"label\": i,\n",
    "            \"distance\": math.dist(comparison_vector, comparison_feature_space[i]),\n",
    "        }\n",
    "        for i in range(NUM_LABELS)\n",
    "        if i != label\n",
    "    ]\n",
    "\n",
    "    # min() returns the first smallest entry, matching a stable sort's [0].\n",
    "    most_similar_label = min(distances, key=lambda x: x[\"distance\"])\n",
    "    print(f\"Most similar label is {most_similar_label}\")\n",
    "\n",
    "    # NOTE(review): candidates are drawn from the query label itself while the\n",
    "    # reported distance belongs to the most similar label - confirm intent.\n",
    "    similar_images = []\n",
    "    for i in range(len(dataset)):\n",
    "        _, image_label = dataset[i]\n",
    "        if image_label == label:\n",
    "            similar_images.append(i)\n",
    "\n",
    "    # random.sample raises ValueError when asked for more items than exist.\n",
    "    similar_images = random.sample(similar_images, min(k_2, len(similar_images)))\n",
    "    images_distances = [\n",
    "        {\"image_id\": i, \"distance\": most_similar_label[\"distance\"]}\n",
    "        for i in similar_images\n",
    "    ]\n",
    "\n",
    "    for x in images_distances:\n",
    "        print(x)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'image_id': 499, 'label': 2, 'distance': 0.5891843615223927}\n",
      "{'image_id': 449, 'label': 2, 'distance': 0.6183329800988425}\n",
      "{'image_id': 462, 'label': 2, 'distance': 0.7954630378173778}\n",
      "{'image_id': 512, 'label': 2, 'distance': 0.8431996693479317}\n",
      "{'image_id': 506, 'label': 2, 'distance': 0.8541263603745314}\n",
      "{'image_id': 438, 'label': 2, 'distance': 0.9166483319951415}\n",
      "{'image_id': 491, 'label': 2, 'distance': 0.9340236427529084}\n",
      "{'image_id': 527, 'label': 2, 'distance': 0.9349318595824383}\n",
      "{'image_id': 441, 'label': 2, 'distance': 0.9351164972683086}\n",
      "{'image_id': 490, 'label': 2, 'distance': 0.9440402757056761}\n"
     ]
    }
   ],
   "source": [
    "# Run the extractor that matches the chosen latent space.\n",
    "match selected_latent_space:\n",
    "    # LS1, LS4\n",
    "    case \"\" | \"image_sim\":\n",
    "        extract_similarities_ls1_ls4(\n",
    "            selected_latent_space, selected_dim_reduction_method, data, label, label_rep\n",
    "        )\n",
    "    # LS3\n",
    "    case \"label_sim\":\n",
    "        extract_similarities_ls3(selected_dim_reduction_method, data, label)\n",
    "    # LS2\n",
    "    case \"cp\":\n",
    "        extract_similarities_ls2(data, label)\n",
    "    case _:\n",
    "        # Previously an unrecognised value fell through and printed nothing.\n",
    "        raise ValueError(f\"unknown latent space: {selected_latent_space!r}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}