def calculate_image_similarity(data, distance_measure):
    """Build the symmetric pairwise object-object distance matrix.

    Args:
        data: Array-like with one object (feature vector) per row.
        distance_measure: Callable ``f(a, b)`` returning the scalar distance
            between two rows.

    Returns:
        ``(n, n)`` float ndarray where entry ``[i][j]`` holds
        ``distance_measure(data[i], data[j])``. Only the upper triangle is
        evaluated and mirrored, so the measure is treated as symmetric and
        the diagonal stays 0.
    """
    rows = np.asarray(data)
    n = rows.shape[0]
    # Convert every row exactly once instead of once per (i, j) pair.
    vectors = [np.asarray(rows[idx]) for idx in range(n)]

    image_sim_matrix = np.zeros((n, n))
    for i in range(n):
        for j in range(i + 1, n):
            pair_distance = distance_measure(vectors[i], vectors[j])
            image_sim_matrix[i, j] = image_sim_matrix[j, i] = pair_distance
    return image_sim_matrix


def mds_projection(data_sim_matrix, n_components=2, scale_by_eigenvalues=False):
    """Project objects into ``n_components`` dimensions from pairwise distances.

    The squared distance matrix is double-centred
    (``B = -1/2 * C * D^2 * C``) and eigen-decomposed; the eigenvectors that
    belong to the largest eigenvalues form the projection axes.

    Args:
        data_sim_matrix: ``(n, n)`` symmetric matrix of pairwise distances.
        n_components: Number of output dimensions.
        scale_by_eigenvalues: When False (default, the historical behaviour)
            the raw unit-norm eigenvectors are returned, so every axis has
            the same overall magnitude regardless of how much variation it
            explains. When True each axis is multiplied by the square root of
            its eigenvalue (negative eigenvalues are treated as 0), which
            yields classical-MDS coordinates whose Euclidean distances
            approximate the input distances.

    Returns:
        ``(n, min(n, n_components))`` ndarray with one row per object.
    """
    distances = np.asarray(data_sim_matrix, dtype=float)
    n = distances.shape[0]

    # Centering matrix
    centering = np.eye(n) - np.ones((n, n)) / n
    # B = -1/2 * C * D^2 * C
    gram = -0.5 * centering @ (distances**2) @ centering

    # eigh returns eigenvalues in ascending order; keep the largest ones.
    eigvals, eigvecs = np.linalg.eigh(gram)
    order = np.argsort(eigvals)[::-1][:n_components]
    components = eigvecs[:, order]

    if scale_by_eigenvalues:
        top_eigvals = eigvals[order]
        # Round-off / non-Euclidean input can give slightly negative values.
        top_eigvals[top_eigvals < 0] = 0.0
        components = components * np.sqrt(top_eigvals)

    return components
def avgandmin_knn_distance(data_sim_matrix, k):
    """Get avg. and minimum k-th nearest neighbor distance.

    Each row of the distance matrix is sorted and its ``k``-th smallest entry
    is taken. Because a row contains the point's own zero distance, the point
    itself counts as its first neighbour, which matches how
    ``DBSCAN.region_query`` counts neighbours.

    Args:
        data_sim_matrix: ``(n, n)`` ndarray of pairwise distances.
        k: 1-based neighbour rank, ``1 <= k <= n``.

    Returns:
        Tuple ``(average_distance, minimum_distance)`` over all rows.

    Raises:
        ValueError: If ``k`` is outside ``1..n``. Previously ``k = 0`` wrapped
            around to the *farthest* neighbour without any error.
    """
    num_columns = data_sim_matrix.shape[1]
    if not 1 <= k <= num_columns:
        raise ValueError(
            f"k must be between 1 and {num_columns} (got {k})"
        )

    # Sort each row of the distance matrix and extract the kth-nearest neighbor distance
    kth_neighbor_distances = np.sort(data_sim_matrix, axis=1)[:, k - 1]

    # Calculate the average and minimum distance of the kth-nearest neighbor
    average_distance = np.mean(kth_neighbor_distances)
    minimum_distance = np.min(kth_neighbor_distances)

    return average_distance, minimum_distance


class DBSCAN:
    """DBSCAN clustering of one label's images from pairwise distances.

    Cluster labels stored in ``self.clusters`` use ``0`` for unclassified,
    ``-1`` for noise and ``1, 2, ...`` for clusters. A point is a core point
    when at least ``min_samples`` points (itself included) lie strictly closer
    than ``eps``.

    Attributes:
        label: Identifier of the data being clustered; used in plot file names.
        eps: Neighbourhood radius (strict ``<`` comparison).
        min_samples: Minimum neighbourhood size for a core point.
        data: Feature array, one row per image.
        distance_measure: Callable used when no distance matrix is supplied.
        image_sim_matrix: ``(n, n)`` pairwise distance matrix.
        clusters: Per-image cluster labels (float ndarray).
        core_points: Image ids of core points, filled by ``get_core_points``.
    """

    # Directory the figures are written to.
    _PLOT_DIR = "Plots"
    # Factor applied to the MDS coordinates before thumbnails are drawn.
    _THUMBNAIL_PLOT_SCALE = 10000
    # A thumbnail spans 1/_THUMBNAIL_FRACTION of each axis range.
    _THUMBNAIL_FRACTION = 10

    def __init__(
        self, label, data, distance_measure, eps, min_samples, data_sim_matrix=None
    ):
        self.label = label
        self.eps = eps
        self.min_samples = min_samples

        self.data = data
        self.distance_measure = distance_measure
        self.num_images = data.shape[0]

        # An all-zero matrix is the "not computed yet" placeholder.
        self.image_sim_matrix = np.zeros((self.num_images, self.num_images))
        if data_sim_matrix is not None:
            self.image_sim_matrix = data_sim_matrix

        self.clusters = np.zeros(self.num_images)  # 0 represents unclassified points
        self.core_points = []

    def dbscan(self):
        """Fit DBSCAN and return the per-image cluster labels.

        If no distance matrix was supplied (the stored matrix is still all
        zeros) it is computed from ``data`` with ``distance_measure`` and
        stored. Labels are reset first, so calling this again re-fits from
        scratch instead of skipping every already-labelled point.

        Returns:
            ``self.clusters``: ndarray with -1 for noise and 1..c for clusters.
        """
        # if similarities not provided/calculated already
        if not np.any(self.image_sim_matrix):
            # The result used to be discarded, leaving every distance at 0.
            self.image_sim_matrix = calculate_image_similarity(
                self.data, self.distance_measure
            )

        self.clusters = np.zeros(self.num_images)
        cluster_id = 0
        for i in range(self.num_images):
            if self.clusters[i] != 0:
                continue  # Skip already classified points

            neighbors = self.region_query(i)
            if len(neighbors) < self.min_samples:
                self.clusters[i] = -1  # Mark point as noise
            else:
                cluster_id += 1  # New cluster identified
                self.clusters[i] = cluster_id
                self.grow_cluster(neighbors, cluster_id)

        return self.clusters

    def region_query(self, center):
        """Return indices of all points strictly closer than ``eps`` to ``center``.

        The centre itself is included whenever ``eps > 0`` because its own
        distance is 0.
        """
        distances = np.asarray(self.image_sim_matrix[center])
        return np.flatnonzero(distances < self.eps).tolist()

    def grow_cluster(self, neighbors, cluster_id):
        """Assign ``cluster_id`` to everything density-reachable from ``neighbors``.

        Noise points that are reached become border points; unclassified
        points join the cluster and, if they are core points themselves, have
        their neighbourhood queued for expansion. The caller's list is not
        modified and each index is queued at most once (revisiting an index
        would be a no-op because it already carries a label).
        """
        queue = list(neighbors)
        queued = set(queue)
        position = 0
        # check neighbors for connected components
        while position < len(queue):
            neighbor = queue[position]
            position += 1

            if self.clusters[neighbor] == -1:
                self.clusters[neighbor] = cluster_id  # Change noise to border point
            elif self.clusters[neighbor] == 0:
                self.clusters[neighbor] = cluster_id
                new_neighbors = self.region_query(neighbor)
                # If new point is a core point, consider its neighbors as well
                if len(new_neighbors) >= self.min_samples:
                    for candidate in new_neighbors:
                        if candidate not in queued:
                            queued.add(candidate)
                            queue.append(candidate)

    def get_core_points(self, label_img_ids):
        """Find core points (after clustering!) and store their image ids.

        Args:
            label_img_ids: Sequence mapping row index -> image id.

        ``self.core_points`` is rebuilt on every call, so repeated calls do
        not accumulate duplicates.
        """
        self.core_points = [
            label_img_ids[i]
            for i in range(self.num_images)
            if self.clusters[i] != -1  # noise points are never core points
            and len(self.region_query(i)) >= self.min_samples
        ]

    def display_cluster_stats(self):
        """Display cluster counts and noise point count (after clustering!)"""
        unique_labels, unique_counts = np.unique(self.clusters, return_counts=True)
        cluster_counts_dict = dict(zip(unique_labels, unique_counts))
        print("Clusters:", cluster_counts_dict)
        print("No. of clusters:", len(cluster_counts_dict.keys() - {-1}))
        if -1 in cluster_counts_dict:
            print("Noise points:", cluster_counts_dict[-1])
        else:
            print("No noise points")

    def _scatter_cluster(self, mds_components, label, **scatter_kwargs):
        """Scatter the points of one cluster label on the current axes."""
        cluster_points = mds_components[self.clusters == label]
        plt.scatter(
            cluster_points[:, 0],
            cluster_points[:, 1],
            label=f"{(f'Cluster {int(label)}') if label != -1 else 'Noise points'}",
            marker=("o" if label != -1 else "*"),
            **scatter_kwargs,
        )

    def mds_scatter_clusters(self):
        """Visualize clusters as point clouds in 2-D space (after clustering!)

        The figure is saved to ``Plots/DBSCAN_MDS_Label_<label>.png`` (the
        directory is created when missing) and then shown.
        """
        import os

        # Perform MDS projection
        mds_components = mds_projection(self.image_sim_matrix)

        # Plot clusters in sorted label order so the legend is deterministic
        plt.figure(figsize=(8, 6))
        for label in np.unique(self.clusters):
            self._scatter_cluster(mds_components, label)

        plt.title("DBSCAN clusters projected onto 2-D MDS space")
        plt.xlabel("MDS component 1")
        plt.ylabel("MDS component 2")
        plt.legend()
        os.makedirs(self._PLOT_DIR, exist_ok=True)
        plt.savefig(f"Plots/DBSCAN_MDS_Label_{self.label}.png")
        plt.show()

    def group_image_clusters(self, image_data):
        """Visualize clusters in 2-D MDS space with a thumbnail per cluster.

        For every non-noise cluster the first member image is drawn at the
        cluster centroid. Clicking a thumbnail brings it in front of the
        points; clicking elsewhere sends it back. The figure is saved to
        ``Plots/DBSCAN_MDS_Label_<label>_with_images.png`` and then shown.

        Args:
            image_data: Sequence indexed like the clustered rows whose items
                provide ``.resize((width, height))`` (presumably PIL images;
                confirm against the caller).
        """
        import os

        # Perform MDS projection, scaled up so thumbnails get a usable pixel size
        mds_components = (
            mds_projection(self.image_sim_matrix) * self._THUMBNAIL_PLOT_SCALE
        )

        min_x_mds = np.min(mds_components[:, 0])
        min_y_mds = np.min(mds_components[:, 1])
        max_x_mds = np.max(mds_components[:, 0])
        max_y_mds = np.max(mds_components[:, 1])

        img_width = (max_x_mds - min_x_mds) / self._THUMBNAIL_FRACTION
        img_height = (max_y_mds - min_y_mds) / self._THUMBNAIL_FRACTION
        # Never ask for a 0-pixel thumbnail (all points coinciding on an axis).
        thumbnail_size = (
            max(1, int(np.ceil(img_width))),
            max(1, int(np.ceil(img_height))),
        )

        # Plot clusters
        plt.figure(figsize=(8, 6))
        for label in np.unique(self.clusters):
            self._scatter_cluster(mds_components, label, zorder=1)

            if label == -1:
                continue
            # Display image thumbnails at cluster centroids
            cluster_indices = np.where(self.clusters == label)[0]
            cluster_center = np.mean(mds_components[cluster_indices], axis=0)
            thumbnail_data = image_data[cluster_indices[0]].resize(thumbnail_size)
            im = plt.imshow(
                thumbnail_data,
                extent=(
                    cluster_center[0] - 0.5 * img_width,
                    cluster_center[0] + 0.5 * img_width,
                    cluster_center[1] - 0.5 * img_height,
                    cluster_center[1] + 0.5 * img_height,
                ),
                interpolation="nearest",
                cmap=plt.cm.gray_r,
                zorder=0,
            )

            # Image border
            x1, x2, y1, y2 = im.get_extent()
            (im_border,) = plt.plot(
                [x1, x2, x2, x1, x1],
                [y1, y1, y2, y2, y1],
                "-",
                linewidth=2,
                solid_capstyle="butt",
                zorder=0,
            )

            # Click to bring to front. The artists are bound through default
            # arguments so each callback keeps its own thumbnail and border.
            def region_click(event, region_area=im, region_border=im_border):
                front_or_back = 2 if region_area.contains(event)[0] else 0
                region_border.set_zorder(front_or_back)
                region_area.set_zorder(front_or_back)
                event.canvas.draw_idle()

            im.figure.canvas.mpl_connect("button_press_event", region_click)

        plt.title("2-D MDS space with image thumbnails at centroids")
        plt.xlabel("MDS component 1")
        plt.ylabel("MDS component 2")
        ax = plt.gca()
        ax.margins(0.05)
        ax.set_aspect(0.75 / ax.get_data_ratio())
        plt.legend()
        os.makedirs(self._PLOT_DIR, exist_ok=True)
        plt.savefig(f"Plots/DBSCAN_MDS_Label_{self.label}_with_images.png")
        plt.show()
# --- Clustering configuration -------------------------------------------
# Feature model whose descriptors are clustered. To pick one interactively,
# index valid_feature_models with
#   str(input("Enter feature model - one of " + str(list(valid_feature_models.keys()))))
selected_feature_model = valid_feature_models["avgpool"]
selected_distance_measure = euclidean_distance_measure
# selected_distance_measure = feature_distance_matches[selected_feature_model]
# Desired number of clusters per label.
selected_c = 10
# Number of epsilon values tried between the min and mean k-NN distance.
NUM_EPS_CANDIDATES = 100


def _cluster_and_noise_counts(clusters):
    """Return ``(number of clusters, number of noise points)`` for a labelling.

    Noise points carry the label -1; every other distinct label counts as one
    cluster.
    """
    unique_labels, unique_counts = np.unique(clusters, return_counts=True)
    is_noise = unique_labels == -1
    return int(np.count_nonzero(~is_noise)), int(unique_counts[is_noise].sum())


# --- Per-label search for the best DBSCAN parameters -----------------------
# best_models[label] ends up holding the DBSCAN instance whose clustering is
# closest to selected_c clusters and, among those, has the fewest noise points.
best_models = []
for label in range(NUM_LABELS):
    print("Clustering label", label, "...", end="\r")

    # get image ids and features in a single pass so row i of label_fds is
    # guaranteed to belong to label_ids[i]
    label_ids = []
    label_feature_rows = []
    for img_fds in fd_collection.find({"true_label": label}):
        label_ids.append(img_fds["image_id"])
        label_feature_rows.append(np.array(img_fds[selected_feature_model]).flatten())
    label_fds = np.array(label_feature_rows)
    num_label_images = label_fds.shape[0]

    # fallback result if no parameter combination can be tried at all
    label_dbscan_results = (np.zeros(num_label_images), 0, 0)
    label_min_noise = num_label_images
    label_min_cluster_diff = np.inf

    label_img_sim_matrix = calculate_image_similarity(
        label_fds, selected_distance_measure
    )

    # decrementally try min_samples, starting from the desired no. of clusters
    # (capped at the no. of images, since the k-th neighbour must exist)
    for cur_min_samples in range(min(selected_c, num_label_images), 1, -1):
        # find range of epsilon to try, by checking all from mean to min knn distance
        # k is current min_samples
        max_eps, min_eps = avgandmin_knn_distance(label_img_sim_matrix, cur_min_samples)

        # try epsilon values
        for cur_eps in np.linspace(min_eps, max_eps, num=NUM_EPS_CANDIDATES):
            label_dbscan = DBSCAN(
                label,
                label_fds,
                selected_distance_measure,
                cur_eps,
                cur_min_samples,
                label_img_sim_matrix,
            )
            clusters = label_dbscan.dbscan()

            num_clusters, noise_pts = _cluster_and_noise_counts(clusters)
            cluster_diff = abs(num_clusters - selected_c)

            # store only most desirable clustering: as close as possible to c
            # clusters, and then minimum noise (ties go to the later candidate)
            if cluster_diff < label_min_cluster_diff or (
                cluster_diff == label_min_cluster_diff and noise_pts <= label_min_noise
            ):
                label_dbscan_results = (clusters, cur_eps, cur_min_samples)
                label_min_noise = noise_pts
                label_min_cluster_diff = cluster_diff

    # rebuild a model carrying the winning parameters and labelling
    best_label_dbscan = DBSCAN(
        label,
        label_fds,
        selected_distance_measure,
        label_dbscan_results[1],
        label_dbscan_results[2],
        label_img_sim_matrix,
    )
    best_label_dbscan.clusters = label_dbscan_results[0]
    best_label_dbscan.get_core_points(label_ids)

    # store best clustering
    best_models.append(best_label_dbscan)