CSE515_MWDB_Project/Phase 3/task2.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from task2_utils import *\n",
    "warnings.filterwarnings('ignore')\n",
    "# interactive plot\n",
    "%matplotlib widget"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collection handle that the clustering loop queries per label via `.find({\"true_label\": ...})`.\n",
    "# getCollection is not defined in this notebook -- presumably it comes from the task2_utils star-import (TODO confirm).\n",
    "fd_collection = getCollection(\"team_5_mwdb_phase_2\", \"fd_collection\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def calculate_image_similarity(data, distance_measure):\n",
    "    \"\"\"Pairwise object-object matrix under `distance_measure`.\n",
    "\n",
    "    Every unordered pair of rows in `data` is evaluated exactly once and the\n",
    "    value is mirrored across the diagonal; the diagonal itself stays at zero.\n",
    "    \"\"\"\n",
    "    count = data.shape[0]\n",
    "    pairwise = np.zeros((count, count))\n",
    "    # Strict upper triangle, visited in row-major order\n",
    "    upper_rows, upper_cols = np.triu_indices(count, k=1)\n",
    "    for row, col in zip(upper_rows, upper_cols):\n",
    "        value = distance_measure(np.array(data[row]), np.array(data[col]))\n",
    "        pairwise[row, col] = value\n",
    "        pairwise[col, row] = value\n",
    "    return pairwise"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def mds_projection(data_sim_matrix, n_components=2, scale=False):\n",
    "    \"\"\"Classical (Torgerson) MDS of a pairwise distance matrix.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    data_sim_matrix : (n, n) array-like\n",
    "        Symmetric matrix of pairwise distances.\n",
    "    n_components : int\n",
    "        Number of leading dimensions to keep.\n",
    "    scale : bool\n",
    "        False (default) returns the unit-norm eigenvectors of the\n",
    "        double-centred matrix, which is what existing callers expect.\n",
    "        True multiplies each eigenvector by the square root of its\n",
    "        eigenvalue (negative eigenvalues clipped to 0), giving coordinates\n",
    "        whose Euclidean distances approximate the input distances.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    (n, n_components) ndarray, columns ordered by decreasing eigenvalue.\n",
    "    \"\"\"\n",
    "    distances = np.asarray(data_sim_matrix, dtype=float)\n",
    "    n = distances.shape[0]\n",
    "    # Centering matrix\n",
    "    C = np.eye(n) - np.ones((n, n)) / n\n",
    "    # B = -1/2 * C * D^2 * C\n",
    "    B = -0.5 * C @ (distances**2) @ C\n",
    "    # eigh returns eigenvalues in ascending order\n",
    "    eigvals, eigvecs = np.linalg.eigh(B)\n",
    "\n",
    "    # Indices of the n_components largest eigenvalues, largest first\n",
    "    leading = np.argsort(eigvals)[::-1][:n_components]\n",
    "    components = eigvecs[:, leading]\n",
    "\n",
    "    if scale:\n",
    "        # Non-Euclidean inputs can yield small negative eigenvalues; clip before sqrt\n",
    "        components = components * np.sqrt(np.clip(eigvals[leading], 0.0, None))\n",
    "\n",
    "    return components"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def avgandmin_knn_distance(data_sim_matrix, k):\n",
    "    \"\"\"Mean and minimum of the k-th smallest entry of every row.\n",
    "\n",
    "    Each row is sorted ascending and column k-1 is read, so whatever sits on\n",
    "    the diagonal of `data_sim_matrix` takes part in the ranking.\n",
    "    Returns the pair (average, minimum).\n",
    "    \"\"\"\n",
    "    rows_ascending = np.sort(data_sim_matrix, axis=1)\n",
    "    kth_column = rows_ascending[:, k - 1]\n",
    "    return np.mean(kth_column), np.min(kth_column)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def display_cluster_stats(clusters):\n",
    "    \"\"\"Print per-label sizes, the number of clusters and the noise count (label -1).\"\"\"\n",
    "    labels, sizes = np.unique(clusters, return_counts=True)\n",
    "    sizes_by_label = dict(zip(labels, sizes))\n",
    "    print(\"Clusters:\", sizes_by_label)\n",
    "    # Label -1 is noise, so it is excluded from the cluster count\n",
    "    print(\"No. of clusters:\", len(sizes_by_label.keys() - {-1}))\n",
    "    if -1 not in sizes_by_label:\n",
    "        print(\"No noise points\")\n",
    "    else:\n",
    "        print(\"Noise points:\", sizes_by_label[-1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "class DBSCAN:\n",
    "    \"\"\"Density-based clustering (DBSCAN) driven by a pairwise distance matrix.\n",
    "\n",
    "    Labels kept in ``self.clusters``: 0 = unclassified, -1 = noise,\n",
    "    1, 2, ... = cluster ids in order of discovery.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(\n",
    "        self, label, data, distance_measure, eps, min_samples, data_sim_matrix=None\n",
    "    ):\n",
    "        \"\"\"Store the clustering parameters.\n",
    "\n",
    "        label: tag used in the filenames of saved plots.\n",
    "        data: array of objects; ``data.shape[0]`` is the number of objects.\n",
    "        distance_measure: callable handed to ``calculate_image_similarity``\n",
    "            when no matrix is supplied.\n",
    "        eps: neighbourhood radius (``region_query`` uses a strict ``<``).\n",
    "        min_samples: minimum neighbourhood size for a core point.\n",
    "        data_sim_matrix: optional precomputed pairwise distance matrix.\n",
    "        \"\"\"\n",
    "        self.label = label\n",
    "        self.eps = eps\n",
    "        self.min_samples = min_samples\n",
    "\n",
    "        self.data = data\n",
    "        self.distance_measure = distance_measure\n",
    "        self.num_images = data.shape[0]\n",
    "\n",
    "        # An all-zero matrix doubles as the \"not computed yet\" sentinel checked in dbscan()\n",
    "        self.image_sim_matrix = np.zeros((self.num_images, self.num_images))\n",
    "        if data_sim_matrix is not None:\n",
    "            self.image_sim_matrix = data_sim_matrix\n",
    "\n",
    "        self.clusters = np.zeros(self.num_images)  # 0 represents unclassified points\n",
    "        self.core_points = []\n",
    "\n",
    "    def dbscan(self):\n",
    "        \"\"\"Run DBSCAN and return the label array (also kept in ``self.clusters``).\"\"\"\n",
    "        # if similarities not provided/calculated already, compute AND keep them\n",
    "        # (the result must be stored, otherwise clustering runs on all zeros)\n",
    "        if np.array_equal(\n",
    "            self.image_sim_matrix, np.zeros((self.num_images, self.num_images))\n",
    "        ):\n",
    "            self.image_sim_matrix = calculate_image_similarity(\n",
    "                self.data, self.distance_measure\n",
    "            )\n",
    "\n",
    "        cluster_id = 0\n",
    "        for i in range(self.num_images):\n",
    "            if self.clusters[i] != 0:\n",
    "                continue  # Skip already classified points\n",
    "\n",
    "            neighbors = self.region_query(i)\n",
    "            if len(neighbors) < self.min_samples:\n",
    "                self.clusters[i] = -1  # Mark point as noise\n",
    "            else:\n",
    "                cluster_id += 1  # New cluster identified\n",
    "                self.clusters[i] = cluster_id\n",
    "                self.grow_cluster(neighbors, cluster_id)\n",
    "\n",
    "        return self.clusters\n",
    "\n",
    "    def region_query(self, center):\n",
    "        \"\"\"Indices whose distance to ``center`` is strictly below ``eps``.\"\"\"\n",
    "        distances = self.image_sim_matrix[center]\n",
    "        return [i for i, dist in enumerate(distances) if dist < self.eps]\n",
    "\n",
    "    def grow_cluster(self, neighbors, cluster_id):\n",
    "        \"\"\"Expand ``cluster_id`` outward from a seed neighbourhood.\n",
    "\n",
    "        ``neighbors`` is used as a work queue and is extended in place.\n",
    "        \"\"\"\n",
    "        i = 0\n",
    "        # check neighbors for connected components\n",
    "        while i < len(neighbors):\n",
    "            neighbor = neighbors[i]\n",
    "\n",
    "            if self.clusters[neighbor] == -1:\n",
    "                self.clusters[neighbor] = cluster_id  # Change noise to border point\n",
    "            elif self.clusters[neighbor] == 0:\n",
    "                self.clusters[neighbor] = cluster_id\n",
    "                new_neighbors = self.region_query(neighbor)\n",
    "                # If new point is a core point\n",
    "                if len(new_neighbors) >= self.min_samples:\n",
    "                    # Queue only points that can still change label; points already\n",
    "                    # assigned to a cluster would be skipped when dequeued anyway\n",
    "                    neighbors.extend(\n",
    "                        p for p in new_neighbors if self.clusters[p] in (0, -1)\n",
    "                    )\n",
    "            i += 1\n",
    "\n",
    "    def get_core_points(self, label_img_ids):\n",
    "        \"\"\"Find core points (after clustering only!)\n",
    "\n",
    "        ``label_img_ids[i]`` is recorded for every non-noise point ``i`` whose\n",
    "        eps-neighbourhood holds at least ``min_samples`` points. The list is\n",
    "        rebuilt on every call, so repeated calls do not accumulate duplicates.\n",
    "        Returns ``self.core_points``.\n",
    "        \"\"\"\n",
    "        self.core_points = []\n",
    "        for i in range(self.num_images):\n",
    "            if self.clusters[i] == -1:\n",
    "                continue  # Skip noise points\n",
    "\n",
    "            if len(self.region_query(i)) >= self.min_samples:\n",
    "                self.core_points.append(label_img_ids[i])\n",
    "        return self.core_points\n",
    "\n",
    "    @staticmethod\n",
    "    def _legend_name(label):\n",
    "        \"\"\"Legend text for a cluster label (-1 is noise).\"\"\"\n",
    "        return f\"Cluster {int(label)}\" if label != -1 else \"Noise points\"\n",
    "\n",
    "    def mds_scatter_clusters(self):\n",
    "        \"\"\"Visualize clusters as point clouds in 2-D space\"\"\"\n",
    "        import os  # local import: the notebook's import cell is a star-import\n",
    "\n",
    "        # Perform MDS projection\n",
    "        mds_components = mds_projection(self.image_sim_matrix)\n",
    "\n",
    "        # Plot clusters\n",
    "        plt.figure(figsize=(8, 6))\n",
    "        # np.unique yields a sorted, repeatable label order (set() order is arbitrary)\n",
    "        for label in np.unique(self.clusters):\n",
    "            cluster_points = mds_components[self.clusters == label]\n",
    "            plt.scatter(\n",
    "                cluster_points[:, 0],\n",
    "                cluster_points[:, 1],\n",
    "                label=self._legend_name(label),\n",
    "            )\n",
    "\n",
    "        plt.title(\"DBSCAN clusters projected onto 2-D MDS space\")\n",
    "        plt.xlabel(\"MDS component 1\")\n",
    "        plt.ylabel(\"MDS component 2\")\n",
    "        plt.legend()\n",
    "        os.makedirs(\"Plots\", exist_ok=True)  # savefig does not create directories\n",
    "        plt.savefig(f\"Plots/DBSCAN_MDS_Label_{self.label}.png\")\n",
    "        plt.show()\n",
    "\n",
    "    def group_image_clusters(self, image_data):\n",
    "        \"\"\"Scatter the clusters in 2-D MDS space with one thumbnail per cluster.\n",
    "\n",
    "        ``image_data[i]`` must offer ``.resize((w, h))`` (the notebook passes\n",
    "        PIL images). The thumbnail shown for a cluster is its first member,\n",
    "        drawn centred on the cluster's mean position; clicking a thumbnail\n",
    "        raises it above the scatter points.\n",
    "        \"\"\"\n",
    "        import os  # local import: the notebook's import cell is a star-import\n",
    "\n",
    "        # Perform MDS projection\n",
    "        mds_components = mds_projection(self.image_sim_matrix)\n",
    "        # Scaling up to fit images inside\n",
    "        mds_components = mds_components * 10000\n",
    "\n",
    "        min_x_mds = np.min(mds_components[:, 0])\n",
    "        min_y_mds = np.min(mds_components[:, 1])\n",
    "        max_x_mds = np.max(mds_components[:, 0])\n",
    "        max_y_mds = np.max(mds_components[:, 1])\n",
    "\n",
    "        # Each thumbnail spans a tenth of the data range on both axes\n",
    "        img_width = (max_x_mds - min_x_mds) / 10\n",
    "        img_height = (max_y_mds - min_y_mds) / 10\n",
    "        # A degenerate projection (zero range) must not request a 0-pixel image\n",
    "        thumbnail_size = (\n",
    "            max(1, int(np.ceil(img_width))),\n",
    "            max(1, int(np.ceil(img_height))),\n",
    "        )\n",
    "\n",
    "        # Plot clusters\n",
    "        plt.figure(figsize=(8, 6))\n",
    "        # np.unique yields a sorted, repeatable label order (set() order is arbitrary)\n",
    "        for label in np.unique(self.clusters):\n",
    "            cluster_points = mds_components[self.clusters == label]\n",
    "            plt.scatter(\n",
    "                cluster_points[:, 0],\n",
    "                cluster_points[:, 1],\n",
    "                label=self._legend_name(label),\n",
    "                zorder=1,\n",
    "            )\n",
    "\n",
    "            # Display image thumbnails at cluster centroids\n",
    "            cluster_indices = np.where(self.clusters == label)[0]\n",
    "            cluster_center = np.mean(mds_components[cluster_indices], axis=0)\n",
    "            thumbnail_data = image_data[cluster_indices[0]].resize(thumbnail_size)\n",
    "            im = plt.imshow(\n",
    "                thumbnail_data,\n",
    "                extent=(\n",
    "                    cluster_center[0] - 0.5 * img_width,\n",
    "                    cluster_center[0] + 0.5 * img_width,\n",
    "                    cluster_center[1] - 0.5 * img_height,\n",
    "                    cluster_center[1] + 0.5 * img_height,\n",
    "                ),\n",
    "                interpolation=\"nearest\",\n",
    "                cmap=plt.cm.gray_r,\n",
    "                zorder=0,\n",
    "            )\n",
    "\n",
    "            # Image border\n",
    "            x1, x2, y1, y2 = im.get_extent()\n",
    "            (im_border,) = plt.plot(\n",
    "                [x1, x2, x2, x1, x1],\n",
    "                [y1, y1, y2, y2, y1],\n",
    "                \"-\",\n",
    "                linewidth=2,\n",
    "                solid_capstyle=\"butt\",\n",
    "                zorder=0,\n",
    "            )\n",
    "\n",
    "            # Click to bring to front; default args bind this iteration's artists\n",
    "            def region_click(event, region_area=im, region_border=im_border):\n",
    "                if region_area.contains(event)[0]:\n",
    "                    region_border.set_zorder(2)\n",
    "                    region_area.set_zorder(2)\n",
    "                else:\n",
    "                    region_border.set_zorder(0)\n",
    "                    region_area.set_zorder(0)\n",
    "\n",
    "            im.figure.canvas.mpl_connect(\"button_press_event\", region_click)\n",
    "\n",
    "        plt.title(\"2-D MDS space with image thumbnails at centroids\")\n",
    "        plt.xlabel(\"MDS component 1\")\n",
    "        plt.ylabel(\"MDS component 2\")\n",
    "        ax = plt.gca()\n",
    "        ax.margins(0.05)\n",
    "        ax.set_aspect(0.75 / ax.get_data_ratio())\n",
    "        plt.legend()\n",
    "        os.makedirs(\"Plots\", exist_ok=True)  # savefig does not create directories\n",
    "        plt.savefig(f\"Plots/DBSCAN_MDS_Label_{self.label}_with_images.png\")\n",
    "        plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run configuration for the per-label clustering below.\n",
    "# The commented-out lines are the alternatives: prompting for the feature model,\n",
    "# and forcing the Euclidean distance measure.\n",
    "# valid_feature_models / feature_distance_matches are not defined in this notebook --\n",
    "# presumably they come from the task2_utils star-import (TODO confirm).\n",
    "# selected_feature_model = valid_feature_models[\n",
    "#     str(input(\"Enter feature model - one of \" + str(list(valid_feature_models.keys()))))\n",
    "# ]\n",
    "selected_feature_model = valid_feature_models[\"avgpool\"]\n",
    "# selected_distance_measure = euclidean_distance_measure\n",
    "# Distance measure looked up by the chosen feature model\n",
    "selected_distance_measure = feature_distance_matches[selected_feature_model]\n",
    "# Desired number of clusters per label; the search compares each cluster count to it\n",
    "# and tries min_samples from 2 * selected_c downwards\n",
    "selected_c = 5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Clustering label 0 ...\n",
      "Clustering label 1 ...\n",
      "Clustering label 2 ...\n",
      "Clustering label 3 ...\n",
      "Clustering label 4 ...\n",
      "Clustering label 5 ...\n",
      "Clustering label 6 ...\n",
      "Clustering label 7 ...\n",
      "Clustering label 8 ...\n",
      "Clustering label 9 ...\n",
      "Clustering label 10 ...\n",
      "Clustering label 11 ...\n",
      "Clustering label 12 ...\n",
      "Clustering label 13 ...\n",
      "Clustering label 14 ...\n",
      "Clustering label 15 ...\n",
      "Clustering label 16 ...\n",
      "Clustering label 17 ...\n",
      "Clustering label 18 ...\n",
      "Clustering label 19 ...\n",
      "Clustering label 20 ...\n",
      "Clustering label 21 ...\n",
      "Clustering label 22 ...\n",
      "Clustering label 23 ...\n",
      "Clustering label 24 ...\n",
      "Clustering label 25 ...\n",
      "Clustering label 26 ...\n",
      "Clustering label 27 ...\n",
      "Clustering label 28 ...\n",
      "Clustering label 29 ...\n",
      "Clustering label 30 ...\n",
      "Clustering label 31 ...\n",
      "Clustering label 32 ...\n",
      "Clustering label 33 ...\n",
      "Clustering label 34 ...\n",
      "Clustering label 35 ...\n",
      "Clustering label 36 ...\n",
      "Clustering label 37 ...\n",
      "Clustering label 38 ...\n",
      "Clustering label 39 ...\n",
      "Clustering label 40 ...\n",
      "Clustering label 41 ...\n",
      "Clustering label 42 ...\n",
      "Clustering label 43 ...\n",
      "Clustering label 44 ...\n",
      "Clustering label 45 ...\n",
      "Clustering label 46 ...\n",
      "Clustering label 47 ...\n",
      "Clustering label 48 ...\n",
      "Clustering label 49 ...\n",
      "Clustering label 50 ...\n",
      "Clustering label 51 ...\n",
      "Clustering label 52 ...\n",
      "Clustering label 53 ...\n",
      "Clustering label 54 ...\n",
      "Clustering label 55 ...\n",
      "Clustering label 56 ...\n",
      "Clustering label 57 ...\n",
      "Clustering label 58 ...\n",
      "Clustering label 59 ...\n",
      "Clustering label 60 ...\n",
      "Clustering label 61 ...\n",
      "Clustering label 62 ...\n",
      "Clustering label 63 ...\n",
      "Clustering label 64 ...\n",
      "Clustering label 65 ...\n",
      "Clustering label 66 ...\n",
      "Clustering label 67 ...\n",
      "Clustering label 68 ...\n",
      "Clustering label 69 ...\n",
      "Clustering label 70 ...\n",
      "Clustering label 71 ...\n",
      "Clustering label 72 ...\n",
      "Clustering label 73 ...\n",
      "Clustering label 74 ...\n",
      "Clustering label 75 ...\n",
      "Clustering label 76 ...\n",
      "Clustering label 77 ...\n",
      "Clustering label 78 ...\n",
      "Clustering label 79 ...\n",
      "Clustering label 80 ...\n",
      "Clustering label 81 ...\n",
      "Clustering label 82 ...\n",
      "Clustering label 83 ...\n",
      "Clustering label 84 ...\n",
      "Clustering label 85 ...\n",
      "Clustering label 86 ...\n",
      "Clustering label 87 ...\n",
      "Clustering label 88 ...\n",
      "Clustering label 89 ...\n",
      "Clustering label 90 ...\n",
      "Clustering label 91 ...\n",
      "Clustering label 92 ...\n",
      "Clustering label 93 ...\n",
      "Clustering label 94 ...\n",
      "Clustering label 95 ...\n",
      "Clustering label 96 ...\n",
      "Clustering label 97 ...\n",
      "Clustering label 98 ...\n",
      "Clustering label 99 ...\n",
      "Clustering label 100 ...\n"
     ]
    }
   ],
   "source": [
    "# Grid-search DBSCAN parameters per label and keep the best model of each label.\n",
    "# \"Best\" = cluster count closest to selected_c, ties broken by fewest noise points\n",
    "# (on a full tie the later candidate wins).\n",
    "best_models = []\n",
    "to_pil = transforms.ToPILImage()  # loop-invariant converter\n",
    "for label in range(NUM_LABELS):\n",
    "    print(\"Clustering label\", label, \"...\")\n",
    "\n",
    "    # Query once so that ids, images and features are guaranteed to line up\n",
    "    # (two separate cursors are not guaranteed to return the same order)\n",
    "    label_docs = list(fd_collection.find({\"true_label\": label}))\n",
    "    label_img_ids = [label_doc[\"image_id\"] for label_doc in label_docs]\n",
    "\n",
    "    # get label's images in PIL format\n",
    "    label_imgs = []\n",
    "    for img_id in label_img_ids:\n",
    "        img, true_label = dataset[img_id]\n",
    "        label_imgs.append(to_pil(img))\n",
    "\n",
    "    # get image features\n",
    "    label_fds = np.array(\n",
    "        [\n",
    "            np.array(label_doc[selected_feature_model]).flatten()\n",
    "            for label_doc in label_docs\n",
    "        ]\n",
    "    )\n",
    "    num_label_imgs = label_fds.shape[0]\n",
    "\n",
    "    # Fallback if no candidate can be evaluated (too few images): everything is noise\n",
    "    label_dbscan_results = (np.full(num_label_imgs, -1.0), 0, 0)\n",
    "    label_min_noise = num_label_imgs\n",
    "    label_min_cluster_diff = np.inf\n",
    "\n",
    "    label_img_sim_matrix = calculate_image_similarity(\n",
    "        label_fds, selected_distance_measure\n",
    "    )\n",
    "\n",
    "    # decrementally try min_samples, starting from twice the desired no. of clusters\n",
    "    for cur_min_samples in range(2 * selected_c, 1, -1):\n",
    "        # the k-th neighbour distance does not exist when the label has fewer than k images\n",
    "        if cur_min_samples > num_label_imgs:\n",
    "            continue\n",
    "\n",
    "        # find range of epsilon to try, by checking all from mean to min knn distance\n",
    "        # k is current min_samples\n",
    "        max_eps, min_eps = avgandmin_knn_distance(label_img_sim_matrix, cur_min_samples)\n",
    "\n",
    "        # try epsilon values\n",
    "        for cur_eps in np.linspace(min_eps, max_eps, num=100):\n",
    "            label_dbscan = DBSCAN(\n",
    "                label,\n",
    "                label_fds,\n",
    "                selected_distance_measure,\n",
    "                cur_eps,\n",
    "                cur_min_samples,\n",
    "                label_img_sim_matrix,\n",
    "            )\n",
    "\n",
    "            clusters = label_dbscan.dbscan()\n",
    "\n",
    "            # label -1 is noise; every other distinct label is a cluster\n",
    "            noise_pts = int(np.count_nonzero(clusters == -1))\n",
    "            num_clusters = np.unique(clusters[clusters != -1]).size\n",
    "            cluster_diff = abs(num_clusters - selected_c)\n",
    "\n",
    "            # store only most desirable clustering: as close as possible to c clusters, and then minimum noise\n",
    "            if cluster_diff < label_min_cluster_diff or (\n",
    "                cluster_diff == label_min_cluster_diff and noise_pts <= label_min_noise\n",
    "            ):\n",
    "                label_dbscan_results = (clusters, cur_eps, cur_min_samples)\n",
    "                label_min_noise = noise_pts\n",
    "                label_min_cluster_diff = cluster_diff\n",
    "\n",
    "    # Rebuild a model carrying the winning parameters and labels\n",
    "    best_label_dbscan = DBSCAN(\n",
    "        label,\n",
    "        label_fds,\n",
    "        selected_distance_measure,\n",
    "        label_dbscan_results[1],\n",
    "        label_dbscan_results[2],\n",
    "        label_img_sim_matrix,\n",
    "    )\n",
    "    best_label_dbscan.clusters = label_dbscan_results[0]\n",
    "    best_label_dbscan.get_core_points(label_img_ids)\n",
    "\n",
    "    # store best clustering (index in best_models == label)\n",
    "    best_models.append(best_label_dbscan)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Visualization:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Label: 0\n",
      "Clusters: {-1.0: 94, 1.0: 15, 2.0: 94, 3.0: 3, 4.0: 4, 5.0: 8}\n",
      "No. of clusters: 5\n",
      "Noise points: 94\n",
      "Core points: 103\n",
      "Label: 1\n",
      "Clusters: {-1.0: 32, 1.0: 154, 2.0: 11, 3.0: 4, 4.0: 8, 5.0: 8}\n",
      "No. of clusters: 5\n",
      "Noise points: 32\n",
      "Core points: 106\n",
      "Label: 2\n",
      "Clusters: {-1.0: 60, 1.0: 4, 2.0: 4, 3.0: 8, 4.0: 4, 5.0: 20}\n",
      "No. of clusters: 5\n",
      "Noise points: 60\n",
      "Core points: 35\n",
      "Label: 3\n",
      "Clusters: {-1.0: 162, 1.0: 218, 2.0: 3, 3.0: 3, 4.0: 9, 5.0: 4}\n",
      "No. of clusters: 5\n",
      "Noise points: 162\n",
      "Core points: 199\n",
      "Label: 4\n",
      "Clusters: {-1.0: 16, 1.0: 9, 2.0: 3}\n",
      "No. of clusters: 2\n",
      "Noise points: 16\n",
      "Core points: 12\n",
      "Label: 5\n",
      "Clusters: {-1.0: 166, 1.0: 222, 2.0: 2, 3.0: 2, 4.0: 5, 5.0: 3}\n",
      "No. of clusters: 5\n",
      "Noise points: 166\n",
      "Core points: 234\n",
      "Label: 6\n",
      "Clusters: {-1.0: 10, 1.0: 7, 2.0: 2, 3.0: 2}\n",
      "No. of clusters: 3\n",
      "Noise points: 10\n",
      "Core points: 11\n",
      "Label: 7\n",
      "Clusters: {-1.0: 15, 1.0: 4, 2.0: 2}\n",
      "No. of clusters: 2\n",
      "Noise points: 15\n",
      "Core points: 6\n",
      "Label: 8\n",
      "Clusters: {-1.0: 12, 1.0: 9, 2.0: 2}\n",
      "No. of clusters: 2\n",
      "Noise points: 12\n",
      "Core points: 11\n",
      "Label: 9\n",
      "Clusters: {-1.0: 14, 1.0: 10, 2.0: 3}\n",
      "No. of clusters: 2\n",
      "Noise points: 14\n",
      "Core points: 3\n",
      "Label: 10\n",
      "Clusters: {-1.0: 15, 1.0: 4, 2.0: 2, 3.0: 2}\n",
      "No. of clusters: 3\n",
      "Noise points: 15\n",
      "Core points: 8\n",
      "Label: 11\n",
      "Clusters: {-1.0: 6, 1.0: 9, 2.0: 2}\n",
      "No. of clusters: 2\n",
      "Noise points: 6\n",
      "Core points: 11\n",
      "Label: 12\n",
      "Clusters: {-1.0: 36, 1.0: 16, 2.0: 3, 3.0: 2, 4.0: 5, 5.0: 2}\n",
      "No. of clusters: 5\n",
      "Noise points: 36\n",
      "Core points: 28\n",
      "Label: 13\n",
      "Clusters: {-1.0: 27, 1.0: 14, 2.0: 2, 3.0: 2, 4.0: 2, 5.0: 2}\n",
      "No. of clusters: 5\n",
      "Noise points: 27\n",
      "Core points: 22\n",
      "Label: 14\n",
      "Clusters: {-1.0: 13, 1.0: 3, 2.0: 3, 3.0: 2}\n",
      "No. of clusters: 3\n",
      "Noise points: 13\n",
      "Core points: 8\n",
      "Label: 15\n",
      "Clusters: {-1.0: 24, 1.0: 13, 2.0: 3, 3.0: 3}\n",
      "No. of clusters: 3\n",
      "Noise points: 24\n",
      "Core points: 19\n",
      "Label: 16\n",
      "Clusters: {-1.0: 36, 1.0: 4, 2.0: 2, 3.0: 3}\n",
      "No. of clusters: 3\n",
      "Noise points: 36\n",
      "Core points: 9\n",
      "Label: 17\n",
      "Clusters: {-1.0: 10, 1.0: 11, 2.0: 2, 3.0: 2}\n",
      "No. of clusters: 3\n",
      "Noise points: 10\n",
      "Core points: 15\n",
      "Label: 18\n",
      "Clusters: {-1.0: 15, 1.0: 5, 2.0: 2}\n",
      "No. of clusters: 2\n",
      "Noise points: 15\n",
      "Core points: 7\n",
      "Label: 19\n",
      "Clusters: {-1.0: 23, 1.0: 30, 2.0: 2, 3.0: 2, 4.0: 2, 5.0: 2}\n",
      "No. of clusters: 5\n",
      "Noise points: 23\n",
      "Core points: 38\n",
      "Label: 20\n",
      "Clusters: {-1.0: 17, 1.0: 3, 2.0: 2, 3.0: 2}\n",
      "No. of clusters: 3\n",
      "Noise points: 17\n",
      "Core points: 7\n",
      "Label: 21\n",
      "Clusters: {-1.0: 13, 1.0: 13, 2.0: 3}\n",
      "No. of clusters: 2\n",
      "Noise points: 13\n",
      "Core points: 3\n",
      "Label: 22\n",
      "Clusters: {-1.0: 18, 1.0: 9, 2.0: 2, 3.0: 2}\n",
      "No. of clusters: 3\n",
      "Noise points: 18\n",
      "Core points: 13\n",
      "Label: 23\n",
      "Clusters: {-1.0: 28, 1.0: 22, 2.0: 2, 3.0: 2}\n",
      "No. of clusters: 3\n",
      "Noise points: 28\n",
      "Core points: 26\n",
      "Label: 24\n",
      "Clusters: {-1.0: 9, 1.0: 11, 2.0: 3}\n",
      "No. of clusters: 2\n",
      "Noise points: 9\n",
      "Core points: 10\n",
      "Label: 25\n",
      "Clusters: {-1.0: 17, 1.0: 16, 2.0: 2}\n",
      "No. of clusters: 2\n",
      "Noise points: 17\n",
      "Core points: 18\n",
      "Label: 26\n",
      "Clusters: {-1.0: 15, 1.0: 17, 2.0: 4}\n",
      "No. of clusters: 2\n",
      "Noise points: 15\n",
      "Core points: 7\n",
      "Label: 27\n",
      "Clusters: {-1.0: 21, 1.0: 2, 2.0: 10, 3.0: 2}\n",
      "No. of clusters: 3\n",
      "Noise points: 21\n",
      "Core points: 14\n",
      "Label: 28\n",
      "Clusters: {-1.0: 14, 1.0: 7, 2.0: 2, 3.0: 2}\n",
      "No. of clusters: 3\n",
      "Noise points: 14\n",
      "Core points: 11\n",
      "Label: 29\n",
      "Clusters: {-1.0: 11, 1.0: 2, 2.0: 8, 3.0: 3, 4.0: 2}\n",
      "No. of clusters: 4\n",
      "Noise points: 11\n",
      "Core points: 15\n",
      "Label: 30\n",
      "Clusters: {-1.0: 16, 1.0: 2, 2.0: 2, 3.0: 5, 4.0: 3}\n",
      "No. of clusters: 4\n",
      "Noise points: 16\n",
      "Core points: 12\n",
      "Label: 31\n",
      "Clusters: {-1.0: 23, 1.0: 4, 2.0: 2, 3.0: 3, 4.0: 2}\n",
      "No. of clusters: 4\n",
      "Noise points: 23\n",
      "Core points: 11\n",
      "Label: 32\n",
      "Clusters: {-1.0: 14, 1.0: 10, 2.0: 2}\n",
      "No. of clusters: 2\n",
      "Noise points: 14\n",
      "Core points: 12\n",
      "Label: 33\n",
      "Clusters: {-1.0: 16, 1.0: 5, 2.0: 2, 3.0: 5, 4.0: 2, 5.0: 2}\n",
      "No. of clusters: 5\n",
      "Noise points: 16\n",
      "Core points: 16\n",
      "Label: 34\n",
      "Clusters: {-1.0: 20, 1.0: 2, 2.0: 2, 3.0: 8, 4.0: 2}\n",
      "No. of clusters: 4\n",
      "Noise points: 20\n",
      "Core points: 14\n",
      "Label: 35\n",
      "Clusters: {-1.0: 19, 1.0: 7, 2.0: 6, 3.0: 2, 4.0: 2, 5.0: 2}\n",
      "No. of clusters: 5\n",
      "Noise points: 19\n",
      "Core points: 19\n",
      "Label: 36\n",
      "Clusters: {-1.0: 20, 1.0: 5, 2.0: 3, 3.0: 4}\n",
      "No. of clusters: 3\n",
      "Noise points: 20\n",
      "Core points: 3\n",
      "Label: 37\n",
      "Clusters: {-1.0: 18, 1.0: 3, 2.0: 3, 3.0: 2}\n",
      "No. of clusters: 3\n",
      "Noise points: 18\n",
      "Core points: 8\n",
      "Label: 38\n",
      "Clusters: {-1.0: 27, 1.0: 2, 2.0: 3}\n",
      "No. of clusters: 2\n",
      "Noise points: 27\n",
      "Core points: 5\n",
      "Label: 39\n",
      "Clusters: {-1.0: 23, 1.0: 15, 2.0: 3, 3.0: 2}\n",
      "No. of clusters: 3\n",
      "Noise points: 23\n",
      "Core points: 20\n",
      "Label: 40\n",
      "Clusters: {-1.0: 22, 1.0: 4, 2.0: 2, 3.0: 5}\n",
      "No. of clusters: 3\n",
      "Noise points: 22\n",
      "Core points: 11\n",
      "Label: 41\n",
      "Clusters: {-1.0: 29, 1.0: 2, 2.0: 3}\n",
      "No. of clusters: 2\n",
      "Noise points: 29\n",
      "Core points: 5\n",
      "Label: 42\n",
      "Clusters: {-1.0: 5, 1.0: 17}\n",
      "No. of clusters: 1\n",
      "Noise points: 5\n",
      "Core points: 13\n",
      "Label: 43\n",
      "Clusters: {-1.0: 10, 1.0: 5, 2.0: 2}\n",
      "No. of clusters: 2\n",
      "Noise points: 10\n",
      "Core points: 7\n",
      "Label: 44\n",
      "Clusters: {-1.0: 4, 1.0: 13}\n",
      "No. of clusters: 1\n",
      "Noise points: 4\n",
      "Core points: 11\n",
      "Label: 45\n",
      "Clusters: {-1.0: 19, 1.0: 2, 2.0: 2, 3.0: 3}\n",
      "No. of clusters: 3\n",
      "Noise points: 19\n",
      "Core points: 7\n",
      "Label: 46\n",
      "Clusters: {-1.0: 20, 1.0: 24, 2.0: 3, 3.0: 2}\n",
      "No. of clusters: 3\n",
      "Noise points: 20\n",
      "Core points: 29\n",
      "Label: 47\n",
      "Clusters: {-1.0: 21, 1.0: 21, 2.0: 2, 3.0: 2, 4.0: 2, 5.0: 2}\n",
      "No. of clusters: 5\n",
      "Noise points: 21\n",
      "Core points: 29\n",
      "Label: 48\n",
      "Clusters: {-1.0: 9, 1.0: 8, 2.0: 4}\n",
      "No. of clusters: 2\n",
      "Noise points: 9\n",
      "Core points: 3\n",
      "Label: 49\n",
      "Clusters: {-1.0: 17, 1.0: 3, 2.0: 2, 3.0: 5}\n",
      "No. of clusters: 3\n",
      "Noise points: 17\n",
      "Core points: 10\n",
      "Label: 50\n",
      "Clusters: {-1.0: 22, 1.0: 20, 2.0: 2}\n",
      "No. of clusters: 2\n",
      "Noise points: 22\n",
      "Core points: 22\n",
      "Label: 51\n",
      "Clusters: {-1.0: 19, 1.0: 2, 2.0: 2, 3.0: 5, 4.0: 9, 5.0: 3}\n",
      "No. of clusters: 5\n",
      "Noise points: 19\n",
      "Core points: 21\n",
      "Label: 52\n",
      "Clusters: {-1.0: 8, 1.0: 6, 2.0: 2}\n",
      "No. of clusters: 2\n",
      "Noise points: 8\n",
      "Core points: 8\n",
      "Label: 53\n",
      "Clusters: {-1.0: 13, 1.0: 17, 2.0: 2}\n",
      "No. of clusters: 2\n",
      "Noise points: 13\n",
      "Core points: 19\n",
      "Label: 54\n",
      "Clusters: {-1.0: 26, 1.0: 2, 2.0: 13, 3.0: 2}\n",
      "No. of clusters: 3\n",
      "Noise points: 26\n",
      "Core points: 17\n",
      "Label: 55\n",
      "Clusters: {-1.0: 30, 1.0: 2, 2.0: 19, 3.0: 4, 4.0: 2}\n",
      "No. of clusters: 4\n",
      "Noise points: 30\n",
      "Core points: 27\n",
      "Label: 56\n",
      "Clusters: {-1.0: 6, 1.0: 24}\n",
      "No. of clusters: 1\n",
      "Noise points: 6\n",
      "Core points: 18\n",
      "Label: 57\n",
      "Clusters: {-1.0: 32, 1.0: 6, 2.0: 3}\n",
      "No. of clusters: 2\n",
      "Noise points: 32\n",
      "Core points: 9\n",
      "Label: 58\n",
      "Clusters: {-1.0: 21, 1.0: 14, 2.0: 2, 3.0: 2}\n",
      "No. of clusters: 3\n",
      "Noise points: 21\n",
      "Core points: 18\n",
      "Label: 59\n",
      "Clusters: {-1.0: 12, 1.0: 4, 2.0: 2, 3.0: 2}\n",
      "No. of clusters: 3\n",
      "Noise points: 12\n",
      "Core points: 8\n",
      "Label: 60\n",
      "Clusters: {-1.0: 26, 1.0: 3, 2.0: 2, 3.0: 2}\n",
      "No. of clusters: 3\n",
      "Noise points: 26\n",
      "Core points: 7\n",
      "Label: 61\n",
      "Clusters: {-1.0: 10, 1.0: 7, 2.0: 3, 3.0: 2}\n",
      "No. of clusters: 3\n",
      "Noise points: 10\n",
      "Core points: 12\n",
      "Label: 62\n",
      "Clusters: {-1.0: 4, 1.0: 4, 2.0: 12}\n",
      "No. of clusters: 2\n",
      "Noise points: 4\n",
      "Core points: 12\n",
      "Label: 63\n",
      "Clusters: {-1.0: 27, 1.0: 2, 2.0: 8, 3.0: 3, 4.0: 3}\n",
      "No. of clusters: 4\n",
      "Noise points: 27\n",
      "Core points: 16\n",
      "Label: 64\n",
      "Clusters: {-1.0: 1, 1.0: 15}\n",
      "No. of clusters: 1\n",
      "Noise points: 1\n",
      "Core points: 9\n",
      "Label: 65\n",
      "Clusters: {-1.0: 24, 1.0: 5, 2.0: 2, 3.0: 3, 4.0: 2, 5.0: 2}\n",
      "No. of clusters: 5\n",
      "Noise points: 24\n",
      "Core points: 14\n",
      "Label: 66\n",
      "Clusters: {-1.0: 14, 1.0: 6, 2.0: 6, 3.0: 2}\n",
      "No. of clusters: 3\n",
      "Noise points: 14\n",
      "Core points: 14\n",
      "Label: 67\n",
      "Clusters: {-1.0: 9, 1.0: 3, 2.0: 2, 3.0: 3}\n",
      "No. of clusters: 3\n",
      "Noise points: 9\n",
      "Core points: 8\n",
      "Label: 68\n",
      "Clusters: {-1.0: 4, 1.0: 16}\n",
      "No. of clusters: 1\n",
      "Noise points: 4\n",
      "Core points: 12\n",
      "Label: 69\n",
      "Clusters: {-1.0: 9, 1.0: 2, 2.0: 3, 3.0: 5, 4.0: 4}\n",
      "No. of clusters: 4\n",
      "Noise points: 9\n",
      "Core points: 14\n",
      "Label: 70\n",
      "Clusters: {-1.0: 3, 1.0: 16}\n",
      "No. of clusters: 1\n",
      "Noise points: 3\n",
      "Core points: 11\n",
      "Label: 71\n",
      "Clusters: {-1.0: 19, 1.0: 2, 2.0: 2}\n",
      "No. of clusters: 2\n",
      "Noise points: 19\n",
      "Core points: 4\n",
      "Label: 72\n",
      "Clusters: {-1.0: 10, 1.0: 2, 2.0: 12, 3.0: 2}\n",
      "No. of clusters: 3\n",
      "Noise points: 10\n",
      "Core points: 16\n",
      "Label: 73\n",
      "Clusters: {-1.0: 5, 1.0: 7, 2.0: 3, 3.0: 2}\n",
      "No. of clusters: 3\n",
      "Noise points: 5\n",
      "Core points: 12\n",
      "Label: 74\n",
      "Clusters: {-1.0: 8, 1.0: 13, 2.0: 3, 3.0: 5}\n",
      "No. of clusters: 3\n",
      "Noise points: 8\n",
      "Core points: 16\n",
      "Label: 75\n",
      "Clusters: {-1.0: 34, 1.0: 3, 2.0: 2, 3.0: 2}\n",
      "No. of clusters: 3\n",
      "Noise points: 34\n",
      "Core points: 7\n",
      "Label: 76\n",
      "Clusters: {-1.0: 22, 1.0: 2, 2.0: 2, 3.0: 3}\n",
      "No. of clusters: 3\n",
      "Noise points: 22\n",
      "Core points: 7\n",
      "Label: 77\n",
      "Clusters: {-1.0: 14, 1.0: 9, 2.0: 2}\n",
      "No. of clusters: 2\n",
      "Noise points: 14\n",
      "Core points: 11\n",
      "Label: 78\n",
      "Clusters: {-1.0: 5, 1.0: 15}\n",
      "No. of clusters: 1\n",
      "Noise points: 5\n",
      "Core points: 11\n",
      "Label: 79\n",
      "Clusters: {-1.0: 17, 1.0: 8, 2.0: 3, 3.0: 3}\n",
      "No. of clusters: 3\n",
      "Noise points: 17\n",
      "Core points: 14\n",
      "Label: 80\n",
      "Clusters: {-1.0: 10, 1.0: 8, 2.0: 2}\n",
      "No. of clusters: 2\n",
      "Noise points: 10\n",
      "Core points: 10\n",
      "Label: 81\n",
      "Clusters: {-1.0: 16, 1.0: 22, 2.0: 4}\n",
      "No. of clusters: 2\n",
      "Noise points: 16\n",
      "Core points: 22\n",
      "Label: 82\n",
      "Clusters: {-1.0: 14, 1.0: 7, 2.0: 3, 3.0: 2, 4.0: 2}\n",
      "No. of clusters: 4\n",
      "Noise points: 14\n",
      "Core points: 14\n",
      "Label: 83\n",
      "Clusters: {-1.0: 10, 1.0: 2, 2.0: 2, 3.0: 4}\n",
      "No. of clusters: 3\n",
      "Noise points: 10\n",
      "Core points: 8\n",
      "Label: 84\n",
      "Clusters: {-1.0: 19, 1.0: 7, 2.0: 2, 3.0: 2, 4.0: 2}\n",
      "No. of clusters: 4\n",
      "Noise points: 19\n",
      "Core points: 13\n",
      "Label: 85\n",
      "Clusters: {-1.0: 6, 1.0: 11, 2.0: 5}\n",
      "No. of clusters: 2\n",
      "Noise points: 6\n",
      "Core points: 14\n",
      "Label: 86\n",
      "Clusters: {-1.0: 25, 1.0: 9, 2.0: 2, 3.0: 2, 4.0: 2, 5.0: 3}\n",
      "No. of clusters: 5\n",
      "Noise points: 25\n",
      "Core points: 18\n",
      "Label: 87\n",
      "Clusters: {-1.0: 18, 1.0: 7, 2.0: 2, 3.0: 3}\n",
      "No. of clusters: 3\n",
      "Noise points: 18\n",
      "Core points: 12\n",
      "Label: 88\n",
      "Clusters: {-1.0: 21, 1.0: 7, 2.0: 2, 3.0: 2}\n",
      "No. of clusters: 3\n",
      "Noise points: 21\n",
      "Core points: 11\n",
      "Label: 89\n",
      "Clusters: {-1.0: 12, 1.0: 3, 2.0: 2}\n",
      "No. of clusters: 2\n",
      "Noise points: 12\n",
      "Core points: 5\n",
      "Label: 90\n",
      "Clusters: {-1.0: 25, 1.0: 3, 2.0: 7, 3.0: 2, 4.0: 2, 5.0: 4}\n",
      "No. of clusters: 5\n",
      "Noise points: 25\n",
      "Core points: 18\n",
      "Label: 91\n",
      "Clusters: {-1.0: 12, 1.0: 10, 2.0: 2}\n",
      "No. of clusters: 2\n",
      "Noise points: 12\n",
      "Core points: 12\n",
      "Label: 92\n",
      "Clusters: {-1.0: 28, 1.0: 11, 2.0: 2, 3.0: 2}\n",
      "No. of clusters: 3\n",
      "Noise points: 28\n",
      "Core points: 15\n",
      "Label: 93\n",
      "Clusters: {-1.0: 29, 1.0: 6, 2.0: 3}\n",
      "No. of clusters: 2\n",
      "Noise points: 29\n",
      "Core points: 9\n",
      "Label: 94\n",
      "Clusters: {-1.0: 65, 1.0: 45, 2.0: 3, 3.0: 2, 4.0: 2, 5.0: 2}\n",
      "No. of clusters: 5\n",
      "Noise points: 65\n",
      "Core points: 54\n",
      "Label: 95\n",
      "Clusters: {-1.0: 12, 1.0: 2, 2.0: 2, 3.0: 3}\n",
      "No. of clusters: 3\n",
      "Noise points: 12\n",
      "Core points: 7\n",
      "Label: 96\n",
      "Clusters: {-1.0: 22, 1.0: 5, 2.0: 2}\n",
      "No. of clusters: 2\n",
      "Noise points: 22\n",
      "Core points: 7\n",
      "Label: 97\n",
      "Clusters: {-1.0: 13, 1.0: 2, 2.0: 2}\n",
      "No. of clusters: 2\n",
      "Noise points: 13\n",
      "Core points: 4\n",
      "Label: 98\n",
      "Clusters: {-1.0: 15, 1.0: 6, 2.0: 5, 3.0: 2}\n",
      "No. of clusters: 3\n",
      "Noise points: 15\n",
      "Core points: 13\n",
      "Label: 99\n",
      "Clusters: {-1.0: 12, 1.0: 4, 2.0: 2, 3.0: 2}\n",
      "No. of clusters: 3\n",
      "Noise points: 12\n",
      "Core points: 8\n",
      "Label: 100\n",
      "Clusters: {-1.0: 13, 1.0: 12, 2.0: 3, 3.0: 2}\n",
      "No. of clusters: 3\n",
      "Noise points: 13\n",
      "Core points: 17\n"
     ]
    }
   ],
   "source": [
    "# For every per-label model: print its cluster summary, then draw its plots\n",
    "for best_model in best_models:\n",
    "    # Ids of the images stored under this model's label\n",
    "    label_img_ids = [\n",
    "        doc[\"image_id\"]\n",
    "        for doc in fd_collection.find({\"true_label\": best_model.label})\n",
    "    ]\n",
    "    # Each dataset entry is an (image, label) pair; only the image is used here\n",
    "    to_pil = transforms.ToPILImage()\n",
    "    label_imgs = [\n",
    "        to_pil(img) for img, _ in (dataset[img_id] for img_id in label_img_ids)\n",
    "    ]\n",
    "    # Interpretation (best_model.eps / best_model.min_samples could be printed here too)\n",
    "    print(\"Label:\", best_model.label)\n",
    "    display_cluster_stats(best_model.clusters)\n",
    "    print(\"Core points:\", len(best_model.core_points))\n",
    "    # MDS point cloud\n",
    "    best_model.mds_scatter_clusters()\n",
    "    # Image thumbnail overlay\n",
    "    best_model.group_image_clusters(label_imgs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Second feature-descriptor collection; opened with a different first argument\n",
    "# (presumably the database name -- confirm) than `fd_collection` at the top\n",
    "full_fd_collection = getCollection(\n",
    "    \"knravish_mwdb_phase_1\",\n",
    "    \"fd_collection\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Predict label based on nearest core point.\n",
    "# Step 1: gather the core points of every per-label model; they are used below\n",
    "# as `image_id` values. They get their own name so that `all_core_pts` only\n",
    "# ever holds the (image_id, true_label, feature vector) tuples, even if the\n",
    "# query below fails part-way through the cell.\n",
    "core_pt_ids = [\n",
    "    core_pt for best_model in best_models for core_pt in best_model.core_points\n",
    "]\n",
    "# Step 2: look up label and selected feature descriptor for each core point\n",
    "all_core_pts = [\n",
    "    (doc[\"image_id\"], doc[\"true_label\"], np.array(doc[selected_feature_model]))\n",
    "    for doc in full_fd_collection.find({\"image_id\": {\"$in\": core_pt_ids}})\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(898, 2, 0.04471824055950302), (998, 2, 0.04670729369738935), (896, 2, 0.05814791090290394), (886, 2, 0.06548439873670464), (970, 2, 0.06915823780454072)]\n"
     ]
    }
   ],
   "source": [
    "# Images to compare against the core points. To process every odd-numbered\n",
    "# image instead, use: query_img_ids = range(1, 8676, 2)\n",
    "query_img_ids = [881]\n",
    "for img_id in query_img_ids:\n",
    "    img_doc = full_fd_collection.find_one({\"image_id\": img_id})\n",
    "    if img_doc is None:\n",
    "        # Name the missing id instead of failing with a TypeError on None\n",
    "        raise LookupError(f\"No feature descriptor stored for image_id {img_id}\")\n",
    "    img_fd = np.array(img_doc[selected_feature_model])\n",
    "    # One (core image id, core label, distance to this image) tuple per core point\n",
    "    distances = [\n",
    "        (core_id, core_label, selected_distance_measure(core_fd, img_fd))\n",
    "        for core_id, core_label, core_fd in all_core_pts\n",
    "    ]\n",
    "    # Print the selected_c closest core points, nearest first\n",
    "    print(sorted(distances, key=lambda dist: dist[2])[:selected_c])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}