task 3 done

2026-01-25 06:14:04 +00:00 · 2023-09-10 20:06:21 -07:00
parent 34d51b28d9
commit e26108bc3d
24 changed files with 5268 additions and 602 deletions
--- a/1/2.ipynb
+++ b/1/2.ipynb
@@ -1,378 +0,0 @@
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pymongo import MongoClient\n",
    "\n",
    "# Connection settings for the local MongoDB database\n",
    "MONGO_URI = \"mongodb://localhost:27017\"\n",
    "DB_NAME = \"knravish_mwdb_phase_1\"\n",
    "\n",
    "client = MongoClient(MONGO_URI)\n",
    "db = client[DB_NAME]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create/access the collection that stores the feature descriptors\n",
    "FD_COLLECTION_NAME = \"fd_collection\"\n",
    "fd_collection = db[FD_COLLECTION_NAME]\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### (Task 1's code without visualization)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import cv2\n",
    "import numpy as np\n",
    "from scipy.stats import skew\n",
    "\n",
    "import torch\n",
    "import torchvision.transforms as transforms\n",
    "\n",
    "import torchvision.datasets as datasets\n",
    "\n",
    "# Raw string: the Windows path is full of backslashes, which a normal string literal\n",
    "# would try to read as (invalid) escape sequences\n",
    "dataset_path = r\"C:\\Kaushik\\ASU\\CSE 515 - Multimedia and Web Databases\\Project\\Datasets\"\n",
    "\n",
    "# Load Caltech101 from dataset_path without downloading it\n",
    "dataset = datasets.Caltech101(\n",
    "    root=dataset_path,\n",
    "    download=False,  # True if you wish to download for first time\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Class transform to partition image into rows x cols grid\n",
    "\n",
    "\n",
    "class GridPartition:\n",
    "    def __init__(self, rows, cols):\n",
    "        self.rows = rows\n",
    "        self.cols = cols\n",
    "\n",
    "    def __call__(self, img):\n",
    "        img_width, img_height = img.size\n",
    "        cell_width = img_width // self.cols\n",
    "        cell_height = img_height // self.rows\n",
    "\n",
    "        grids = []\n",
    "        for i in range(self.rows):\n",
    "            for j in range(self.cols):\n",
    "                left = j * cell_width\n",
    "                top = i * cell_height\n",
    "                right = left + cell_width\n",
    "                bottom = top + cell_height\n",
    "                grid = img.crop((left, top, right, bottom))\n",
    "                grids.append(grid)\n",
    "\n",
    "        return grids\n",
    "\n",
    "\n",
    "def compute_color_moments(image):\n",
    "    image = np.array(image)  # Convert PIL Image to NumPy array\n",
    "    moments = []\n",
    "\n",
    "    for channel in range(3):  # Iterate over RGB channels\n",
    "        channel_data = image[:, :, channel]\n",
    "        mean = np.mean(channel_data)\n",
    "        std_dev = np.std(channel_data)\n",
    "        skewness = skew(channel_data, axis=None)\n",
    "        moments.append([mean, std_dev, skewness])\n",
    "\n",
    "    return moments\n",
    "\n",
    "\n",
    "# Color moments of every grid cell, returned as 1-d array for easier resizing by torch\n",
    "def compute_color_moments_for_grid(grid):\n",
    "    \"\"\"Compute the color moments of each cell in grid and flatten them into one array.\"\"\"\n",
    "    per_cell = []\n",
    "    for cell in grid:\n",
    "        per_cell.append(compute_color_moments(cell))\n",
    "    return np.array(per_cell).flatten()\n",
    "\n",
    "\n",
    "def combine_color_moments(grid_color_moments):\n",
    "    return torch.Tensor(grid_color_moments).view(\n",
    "        10, 10, 3, 3\n",
    "    )  # resize as needed: 10x10 grid, 3 channels per cell, 3 moments per channel\n",
    "\n",
    "\n",
    "# Color moments pipeline: resize, split into cells, compute moments, reshape to a tensor\n",
    "cm_steps = [\n",
    "    transforms.Resize((100, 300)),  # resize to H:W=100:300\n",
    "    GridPartition(rows=10, cols=10),  # grid of 10 rows, 10 columns as a list\n",
    "    compute_color_moments_for_grid,\n",
    "    combine_color_moments,\n",
    "]\n",
    "CM_transform = transforms.Compose(cm_steps)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def compute_gradient_histogram(grid_cell):\n",
    "    \"\"\"Compute a 9-bin, magnitude-weighted histogram of gradient directions for one cell.\n",
    "\n",
    "    Returns:\n",
    "        One-element list holding the histogram array.\n",
    "    \"\"\"\n",
    "    # Convert grid cell to NumPy array\n",
    "    cell_pixels = np.array(grid_cell, dtype=np.uint8)\n",
    "\n",
    "    # First-order x and y derivatives (Sobel with ksize=1)\n",
    "    grad_x = cv2.Sobel(cell_pixels, cv2.CV_32F, dx=1, dy=0, ksize=1)\n",
    "    grad_y = cv2.Sobel(cell_pixels, cv2.CV_32F, dx=0, dy=1, ksize=1)\n",
    "\n",
    "    # Magnitude and direction (in degrees) of the gradients\n",
    "    magnitude = np.sqrt(grad_x**2 + grad_y**2)\n",
    "    direction = np.arctan2(grad_y, grad_x) * 180 / np.pi\n",
    "\n",
    "    # 9 bins across the range of -180 to 180 degrees, weighted by gradient magnitude\n",
    "    histogram = np.histogram(direction, bins=9, range=(-180, 180), weights=magnitude)[0]\n",
    "\n",
    "    return [histogram]\n",
    "\n",
    "\n",
    "def compute_histograms_for_grid(grid):\n",
    "    \"\"\"Compute the gradient histogram of each cell in grid and flatten them into one array.\"\"\"\n",
    "    per_cell = list(map(compute_gradient_histogram, grid))\n",
    "    return np.array(per_cell).flatten()\n",
    "\n",
    "\n",
    "def combine_histograms(grid_histograms):\n",
    "    return torch.Tensor(grid_histograms).view(10, 10, 9)\n",
    "\n",
    "\n",
    "# HOG pipeline: grayscale, resize, split into cells, compute histograms, reshape to a tensor\n",
    "hog_steps = [\n",
    "    transforms.Grayscale(num_output_channels=1),  # grayscale transform\n",
    "    transforms.Resize((100, 300)),  # resize to H:W=100:300\n",
    "    GridPartition(rows=10, cols=10),  # grid of 10 rows, 10 columns as a list\n",
    "    compute_histograms_for_grid,\n",
    "    combine_histograms,\n",
    "]\n",
    "HOG_transform = transforms.Compose(hog_steps)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torchvision.models as models\n",
    "\n",
    "# Load model\n",
    "model = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)\n",
    "\n",
    "# use GPU (Nvidia) when available, otherwise fall back to CPU\n",
    "if torch.cuda.is_available():\n",
    "    dev = torch.device(\"cuda\")\n",
    "    torch.cuda.empty_cache()\n",
    "else:\n",
    "    dev = torch.device(\"cpu\")\n",
    "\n",
    "# The model is only used for feature extraction, so switch it to evaluation mode:\n",
    "# in training mode batch norm normalizes with per-batch statistics and updates its\n",
    "# running averages on every forward pass, which makes the extracted features depend\n",
    "# on which images were processed before\n",
    "model = model.to(dev).eval()\n",
    "\n",
    "\n",
    "# Feature extractor for all layers at once\n",
    "\n",
    "\n",
    "class FeatureExtractor(torch.nn.Module):\n",
    "    def __init__(self, model, layers):\n",
    "        super().__init__()\n",
    "        self.model = model\n",
    "        self.layers = layers\n",
    "        self._features = {layer: None for layer in layers}  # store layer outputs here\n",
    "\n",
    "        # Create hooks for all specified layers at once\n",
    "        for layer_id in layers:\n",
    "            layer = dict(self.model.named_modules())[layer_id]  # get actual layer in the model\n",
    "            layer.register_forward_hook(self.save_outputs_hook(layer_id))  # register feature extractor hook on layer\n",
    "\n",
    "    # Hook to save output of layer\n",
    "    def save_outputs_hook(self, layer_id):\n",
    "        def fn(_module, _input, output):\n",
    "            self._features[layer_id] = output\n",
    "\n",
    "        return fn\n",
    "\n",
    "    # Forward pass returns extracted features\n",
    "    def forward(self, input):\n",
    "        _ = self.model(input)\n",
    "        return self._features\n",
    "\n",
    "\n",
    "def resnet_extractor(image, img_channels):\n",
    "    # ResNet50 expects 3 channel image\n",
    "    if img_channels != 3:\n",
    "        return (None, None, None)\n",
    "\n",
    "    resized_image = (\n",
    "        torch.Tensor(np.array(transforms.Resize((224, 224))(image)).flatten())\n",
    "        .view(1, 3, 224, 224)\n",
    "        .to(dev)\n",
    "    )\n",
    "\n",
    "    # Attach all hooks on model and extract features\n",
    "    resnet_features = FeatureExtractor(model=model, layers=[\"avgpool\", \"layer3\", \"fc\"])\n",
    "    features = resnet_features(resized_image)\n",
    "\n",
    "    avgpool_2048 = features[\"avgpool\"]\n",
    "    # Reshape the vector into row pairs of elements and average across rows\n",
    "    avgpool_1024_fd = torch.mean(avgpool_2048.view(-1, 2), axis=1)\n",
    "\n",
    "    layer3_1024_14_14 = features[\"layer3\"]\n",
    "    # Reshape the vector into 1024 rows of 196 elements and average across rows\n",
    "    layer3_1024_fd = torch.mean(layer3_1024_14_14.view(1024, -1), axis=1)\n",
    "\n",
    "    fc_1000_fd = features[\"fc\"].view(1000)\n",
    "\n",
    "    return (\n",
    "        avgpool_1024_fd.detach().cpu().tolist(),\n",
    "        layer3_1024_fd.detach().cpu().tolist(),\n",
    "        fc_1000_fd.detach().cpu().tolist(),\n",
    "    )\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Process all images and store in collection (one-time processing)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\rknar\\AppData\\Local\\Temp\\ipykernel_8384\\3604855272.py:35: RuntimeWarning: Precision loss occurred in moment calculation due to catastrophic cancellation. This occurs when the data are nearly identical. Results may be unreliable.\n",
      "  skewness = skew(channel_data, axis=None)\n"
     ]
    }
   ],
   "source": [
    "# Range of dataset indices to process\n",
    "start = 0\n",
    "stop = len(dataset)\n",
    "step = 1\n",
    "\n",
    "for idx in range(start, stop, step):\n",
    "    img, label = dataset[idx]\n",
    "\n",
    "    img_shape = np.array(img).shape\n",
    "\n",
    "    if len(img_shape) >= 3 and img_shape[2] >= 3:\n",
    "        cm_fd = CM_transform(img).tolist()\n",
    "        img_channels = 3\n",
    "    else:\n",
    "        # no color moments for grayscale images\n",
    "        # TODO: perhaps we could do conversion by stacking channels? or is there some grayscale-to-RGB function?\n",
    "        cm_fd = None\n",
    "        img_channels = 1\n",
    "\n",
    "    hog_fd = HOG_transform(img).tolist()\n",
    "    avgpool_1024_fd, layer3_1024_fd, fc_1000_fd = resnet_extractor(img, img_channels)\n",
    "\n",
    "    # Store to collection. Upsert keyed on image_id so that re-running this cell\n",
    "    # overwrites the existing document instead of inserting a duplicate.\n",
    "    fd_collection.replace_one(\n",
    "        {\"image_id\": idx},\n",
    "        {\n",
    "            \"image_id\": idx,\n",
    "            \"true_label\": label,\n",
    "            \"channels\": img_channels,\n",
    "            \"cm_fd\": cm_fd,\n",
    "            \"hog_fd\": hog_fd,\n",
    "            \"avgpool_fd\": avgpool_1024_fd,\n",
    "            \"layer3_fd\": layer3_1024_fd,\n",
    "            \"fc_fd\": fc_1000_fd,\n",
    "        },\n",
    "        upsert=True,\n",
    "    )\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1581"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Remove duplicates (accidental re-runs): keep the first document found per image_id\n",
    "for image_id in fd_collection.distinct(\"image_id\"):\n",
    "    matches = fd_collection.find({\"image_id\": image_id})\n",
    "    for position, doc in enumerate(matches):\n",
    "        if position:\n",
    "            fd_collection.delete_one({\"_id\": doc[\"_id\"]})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.5"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
--- a/1/3.ipynb
+++ b/1/3.ipynb
--- a/1/Plots/Image_0_avgpool_euclidean_distance_measure_k10.png
+++ b/1/Plots/Image_0_avgpool_euclidean_distance_measure_k10.png
--- a/1/Plots/Image_0_cm_pearson_distance_measure_k10.png
+++ b/1/Plots/Image_0_cm_pearson_distance_measure_k10.png
--- a/1/Plots/Image_0_fc_euclidean_distance_measure_k10.png
+++ b/1/Plots/Image_0_fc_euclidean_distance_measure_k10.png
--- a/1/Plots/Image_0_hog_cosine_distance_measure_k10.png
+++ b/1/Plots/Image_0_hog_cosine_distance_measure_k10.png
--- a/1/Plots/Image_0_layer3_euclidean_distance_measure_k10.png
+++ b/1/Plots/Image_0_layer3_euclidean_distance_measure_k10.png
--- a/1/Plots/Image_123_cm_pearson_distance_measure_k10.png
+++ b/1/Plots/Image_123_cm_pearson_distance_measure_k10.png
--- a/1/Plots/Image_2500_avgpool_euclidean_distance_measure_k10.png
+++ b/1/Plots/Image_2500_avgpool_euclidean_distance_measure_k10.png
--- a/1/Plots/Image_2500_cm_pearson_distance_measure_k10.png
+++ b/1/Plots/Image_2500_cm_pearson_distance_measure_k10.png
--- a/1/Plots/Image_2500_fc_euclidean_distance_measure_k10.png
+++ b/1/Plots/Image_2500_fc_euclidean_distance_measure_k10.png
--- a/1/Plots/Image_2500_hog_cosine_distance_measure_k10.png
+++ b/1/Plots/Image_2500_hog_cosine_distance_measure_k10.png
--- a/1/Plots/Image_2500_layer3_euclidean_distance_measure_k10.png
+++ b/1/Plots/Image_2500_layer3_euclidean_distance_measure_k10.png
--- a/1/Plots/Image_5122_hog_cosine_distance_measure_k10.png
+++ b/1/Plots/Image_5122_hog_cosine_distance_measure_k10.png
--- a/1/Plots/Image_8676_hog_cosine_distance_measure_k10.png
+++ b/1/Plots/Image_8676_hog_cosine_distance_measure_k10.png
--- a/1/Plots/Image_880_avgpool_euclidean_distance_measure_k10.png
+++ b/1/Plots/Image_880_avgpool_euclidean_distance_measure_k10.png
--- a/1/Plots/Image_880_cm_pearson_distance_measure_k10.png
+++ b/1/Plots/Image_880_cm_pearson_distance_measure_k10.png
--- a/1/Plots/Image_880_fc_euclidean_distance_measure_k10.png
+++ b/1/Plots/Image_880_fc_euclidean_distance_measure_k10.png
--- a/1/Plots/Image_880_hog_cosine_distance_measure_k10.png
+++ b/1/Plots/Image_880_hog_cosine_distance_measure_k10.png
--- a/1/Plots/Image_880_layer3_euclidean_distance_measure_k10.png
+++ b/1/Plots/Image_880_layer3_euclidean_distance_measure_k10.png
--- a/1/README.md
+++ b/1/README.md
@@ -0,0 +1,9 @@
 # Phase 1
 Getting started: PyTorch, Caltech101 dataset, ResNet50 and similarity measures
 - Refer to phase1_project23.pdf for the problem description
 - For task 3, the best distance measures seem to be:
  - Color moments - Pearson (faces especially)
  - Histogram of oriented gradients (HOG) - Cosine similarity
  - ResNet50 (avgpool, layer3, fc) - results were unsatisfactory for all of them, so Euclidean distance was simply used
--- a/1/task_1.ipynb
+++ b/1/task_1.ipynb
--- a/1/task_2.ipynb
+++ b/1/task_2.ipynb
--- a/1/task_3.ipynb
+++ b/1/task_3.ipynb