{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Caltech101 feature descriptor extraction\n",
    "\n",
    "Computes color-moment, HOG and ResNet50 (`avgpool`, `layer3`, `fc`) feature descriptors for every image of the Caltech101 dataset and stores them in the `fd_collection` MongoDB collection.\n",
    "\n",
    "The notebook is meant to be run top to bottom on a fresh kernel (Restart Kernel → Run All)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from pathlib import Path\n",
    "\n",
    "import cv2\n",
    "import numpy as np\n",
    "import torch\n",
    "import torchvision.datasets as datasets\n",
    "import torchvision.models as models\n",
    "import torchvision.transforms as transforms\n",
    "from pymongo import MongoClient\n",
    "from scipy.stats import skew"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# MongoDB connection (override with the MWDB_MONGO_URI environment variable)\n",
    "MONGO_URI = os.environ.get(\"MWDB_MONGO_URI\", \"mongodb://localhost:27017\")\n",
    "DB_NAME = \"knravish_mwdb_phase_1\"\n",
    "FD_COLLECTION_NAME = \"fd_collection\"\n",
    "\n",
    "# Caltech101 root folder (override with the MWDB_DATASET_DIR environment variable).\n",
    "# Raw string so the Windows path separators are never parsed as escape sequences.\n",
    "DATASET_DIR = Path(\n",
    "    os.environ.get(\n",
    "        \"MWDB_DATASET_DIR\",\n",
    "        r\"C:\\Kaushik\\ASU\\CSE 515 - Multimedia and Web Databases\\Project\\Datasets\",\n",
    "    )\n",
    ")\n",
    "\n",
    "# Grid shared by the color-moment and HOG descriptors\n",
    "GRID_ROWS = 10\n",
    "GRID_COLS = 10\n",
    "GRID_IMAGE_SIZE = (100, 300)  # (height, width) the image is resized to before partitioning\n",
    "HOG_BINS = 9\n",
    "\n",
    "# ResNet50 input size and the layers whose outputs are turned into descriptors\n",
    "RESNET_INPUT_SIZE = (224, 224)\n",
    "RESNET_LAYERS = [\"avgpool\", \"layer3\", \"fc\"]\n",
    "# Channel statistics used to normalise inputs for the ImageNet-pretrained torchvision weights\n",
    "IMAGENET_MEAN = [0.485, 0.456, 0.406]\n",
    "IMAGENET_STD = [0.229, 0.224, 0.225]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Database and dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Connect to local MongoDB database\n",
    "client = MongoClient(MONGO_URI)\n",
    "db = client[DB_NAME]\n",
    "\n",
    "# Create/access feature descriptor collection\n",
    "fd_collection = db[FD_COLLECTION_NAME]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset = datasets.Caltech101(\n",
    "    root=str(DATASET_DIR),\n",
    "    download=False,  # True if you wish to download for first time\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Feature extractors (Task 1's code, without visualization)\n",
    "\n",
    "### Color moments"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class GridPartition:\n",
    "    \"\"\"Transform that partitions a PIL image into a rows x cols grid.\"\"\"\n",
    "\n",
    "    def __init__(self, rows, cols):\n",
    "        self.rows = rows\n",
    "        self.cols = cols\n",
    "\n",
    "    def __call__(self, img):\n",
    "        \"\"\"Return the grid cells as a flat, row-major list of cropped images.\"\"\"\n",
    "        img_width, img_height = img.size\n",
    "        cell_width = img_width // self.cols\n",
    "        cell_height = img_height // self.rows\n",
    "\n",
    "        grids = []\n",
    "        for i in range(self.rows):\n",
    "            for j in range(self.cols):\n",
    "                left = j * cell_width\n",
    "                top = i * cell_height\n",
    "                grids.append(\n",
    "                    img.crop((left, top, left + cell_width, top + cell_height))\n",
    "                )\n",
    "\n",
    "        return grids\n",
    "\n",
    "\n",
    "def compute_color_moments(image):\n",
    "    \"\"\"Return [mean, std_dev, skewness] for each of the first three channels.\n",
    "\n",
    "    A perfectly uniform channel has no defined skewness: scipy returns NaN and\n",
    "    emits a precision-loss RuntimeWarning. Such channels are reported with a\n",
    "    skewness of 0.0 so that no NaN ends up in the stored descriptor.\n",
    "    \"\"\"\n",
    "    pixels = np.asarray(image, dtype=np.float64)  # PIL image -> H x W x C array\n",
    "    moments = []\n",
    "\n",
    "    for channel in range(3):  # iterate over RGB channels\n",
    "        channel_data = pixels[:, :, channel]\n",
    "        mean = float(np.mean(channel_data))\n",
    "        std_dev = float(np.std(channel_data))\n",
    "        # skip scipy entirely for constant data: the result would be NaN\n",
    "        skewness = float(skew(channel_data, axis=None)) if std_dev > 0 else 0.0\n",
    "        if not np.isfinite(skewness):\n",
    "            skewness = 0.0\n",
    "        moments.append([mean, std_dev, skewness])\n",
    "\n",
    "    return moments\n",
    "\n",
    "\n",
    "def compute_color_moments_for_grid(grid):\n",
    "    \"\"\"Color moments of every grid cell as a 1-d array (easier to reshape with torch).\"\"\"\n",
    "    color_moments = [compute_color_moments(grid_cell) for grid_cell in grid]\n",
    "    return np.array(color_moments).flatten()\n",
    "\n",
    "\n",
    "def combine_color_moments(grid_color_moments):\n",
    "    \"\"\"Reshape the flat moments to (rows, cols, 3 channels per cell, 3 moments per channel).\"\"\"\n",
    "    return torch.as_tensor(grid_color_moments, dtype=torch.float32).view(\n",
    "        GRID_ROWS, GRID_COLS, 3, 3\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "CM_transform = transforms.Compose(\n",
    "    [\n",
    "        transforms.Resize(GRID_IMAGE_SIZE),  # resize to H:W=100:300\n",
    "        GridPartition(rows=GRID_ROWS, cols=GRID_COLS),  # list of 10 x 10 grid cells\n",
    "        compute_color_moments_for_grid,\n",
    "        combine_color_moments,\n",
    "    ]\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Histograms of oriented gradients (HOG)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def compute_gradient_histogram(grid_cell):\n",
    "    \"\"\"Return a one-element list holding the gradient histogram of a grid cell.\n",
    "\n",
    "    The histogram has HOG_BINS bins counted across -180 to 180 degrees and is\n",
    "    weighted by gradient magnitude.\n",
    "    \"\"\"\n",
    "    # Convert grid cell to NumPy array\n",
    "    grid_array = np.array(grid_cell, dtype=np.uint8)\n",
    "\n",
    "    # Compute the gradient using first-order central differences\n",
    "    # (ksize=1: x derivative = [-1, 0, 1], y derivative = [-1, 0, 1]^T)\n",
    "    dx = cv2.Sobel(grid_array, cv2.CV_32F, dx=1, dy=0, ksize=1)\n",
    "    dy = cv2.Sobel(grid_array, cv2.CV_32F, dx=0, dy=1, ksize=1)\n",
    "\n",
    "    # Compute magnitude and direction of gradients\n",
    "    magnitude = np.sqrt(dx**2 + dy**2)\n",
    "    direction = np.arctan2(dy, dx) * 180 / np.pi  # in degrees\n",
    "\n",
    "    histogram, _ = np.histogram(\n",
    "        direction, bins=HOG_BINS, range=(-180, 180), weights=magnitude\n",
    "    )\n",
    "\n",
    "    return [histogram]\n",
    "\n",
    "\n",
    "def compute_histograms_for_grid(grid):\n",
    "    \"\"\"Gradient histograms of every grid cell as a 1-d array.\"\"\"\n",
    "    histograms = [compute_gradient_histogram(grid_cell) for grid_cell in grid]\n",
    "    return np.array(histograms).flatten()\n",
    "\n",
    "\n",
    "def combine_histograms(grid_histograms):\n",
    "    \"\"\"Reshape the flat histograms to (rows, cols, bins).\"\"\"\n",
    "    return torch.as_tensor(grid_histograms, dtype=torch.float32).view(\n",
    "        GRID_ROWS, GRID_COLS, HOG_BINS\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "HOG_transform = transforms.Compose(\n",
    "    [\n",
    "        transforms.Grayscale(num_output_channels=1),  # grayscale transform\n",
    "        transforms.Resize(GRID_IMAGE_SIZE),  # resize to H:W=100:300\n",
    "        GridPartition(rows=GRID_ROWS, cols=GRID_COLS),  # list of 10 x 10 grid cells\n",
    "        compute_histograms_for_grid,\n",
    "        combine_histograms,\n",
    "    ]\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### ResNet50 layer outputs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class FeatureExtractor(torch.nn.Module):\n",
    "    \"\"\"Feature extractor for all requested layers at once.\n",
    "\n",
    "    Registers one forward hook per layer name in `layers`; a forward pass\n",
    "    returns a dict mapping each layer name to that layer's latest output.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, model, layers):\n",
    "        super().__init__()\n",
    "        self.model = model\n",
    "        self.layers = layers\n",
    "        self._features = {layer: None for layer in layers}  # store layer outputs here\n",
    "        self._hook_handles = []  # kept so the hooks can be removed again\n",
    "\n",
    "        # Create hooks for all specified layers at once\n",
    "        named_modules = dict(self.model.named_modules())\n",
    "        for layer_id in layers:\n",
    "            layer = named_modules[layer_id]  # get actual layer in the model\n",
    "            handle = layer.register_forward_hook(self.save_outputs_hook(layer_id))\n",
    "            self._hook_handles.append(handle)\n",
    "\n",
    "    # Hook to save output of layer\n",
    "    def save_outputs_hook(self, layer_id):\n",
    "        def fn(_module, _input, output):\n",
    "            self._features[layer_id] = output\n",
    "\n",
    "        return fn\n",
    "\n",
    "    def remove_hooks(self):\n",
    "        \"\"\"Detach every hook this extractor registered on the wrapped model.\"\"\"\n",
    "        for handle in self._hook_handles:\n",
    "            handle.remove()\n",
    "        self._hook_handles.clear()\n",
    "\n",
    "    # Forward pass returns extracted features\n",
    "    def forward(self, input):\n",
    "        _ = self.model(input)\n",
    "        return self._features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load model\n",
    "model = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)\n",
    "\n",
    "# use GPU (Nvidia)\n",
    "if torch.cuda.is_available():\n",
    "    dev = torch.device(\"cuda\")\n",
    "    torch.cuda.empty_cache()\n",
    "else:\n",
    "    dev = torch.device(\"cpu\")\n",
    "\n",
    "# eval(): BatchNorm must use its stored statistics. In training mode every image\n",
    "# would be normalised with its own batch statistics and would also update the\n",
    "# running averages, so descriptors would depend on processing order.\n",
    "model = model.to(dev).eval()\n",
    "\n",
    "# Resize, convert to a CHW float tensor in [0, 1] and normalise per channel\n",
    "resnet_preprocess = transforms.Compose(\n",
    "    [\n",
    "        transforms.Resize(RESNET_INPUT_SIZE),\n",
    "        transforms.ToTensor(),\n",
    "        transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),\n",
    "    ]\n",
    ")\n",
    "\n",
    "# Attach all hooks ONCE. Building a FeatureExtractor per image would stack three\n",
    "# more hooks on the shared model for every call and keep old outputs alive.\n",
    "resnet_features = FeatureExtractor(model=model, layers=RESNET_LAYERS)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def resnet_extractor(image, img_channels):\n",
    "    \"\"\"Return the (avgpool, layer3, fc) ResNet50 descriptors of a PIL image as lists.\n",
    "\n",
    "    Returns (None, None, None) when `img_channels` is not 3.\n",
    "\n",
    "    NOTE: descriptors stored by earlier versions of this notebook were computed\n",
    "    from a channel-scrambled, un-normalised input with the model in training\n",
    "    mode; they are not comparable with these and should be regenerated.\n",
    "    \"\"\"\n",
    "    # ResNet50 expects 3 channel image\n",
    "    if img_channels != 3:\n",
    "        return (None, None, None)\n",
    "\n",
    "    # convert(\"RGB\") drops a possible alpha channel; unsqueeze adds the batch axis\n",
    "    resized_image = resnet_preprocess(image.convert(\"RGB\")).unsqueeze(0).to(dev)\n",
    "\n",
    "    # Inference only: no autograd graph needed\n",
    "    with torch.no_grad():\n",
    "        features = resnet_features(resized_image)\n",
    "\n",
    "    avgpool_2048 = features[\"avgpool\"]\n",
    "    # Reshape the vector into row pairs of elements and average across rows\n",
    "    avgpool_1024_fd = torch.mean(avgpool_2048.view(-1, 2), dim=1)\n",
    "\n",
    "    layer3_1024_14_14 = features[\"layer3\"]\n",
    "    # Reshape the vector into 1024 rows of 196 elements and average across rows\n",
    "    layer3_1024_fd = torch.mean(layer3_1024_14_14.view(1024, -1), dim=1)\n",
    "\n",
    "    fc_1000_fd = features[\"fc\"].view(1000)\n",
    "\n",
    "    return (\n",
    "        avgpool_1024_fd.detach().cpu().tolist(),\n",
    "        layer3_1024_fd.detach().cpu().tolist(),\n",
    "        fc_1000_fd.detach().cpu().tolist(),\n",
    "    )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Process all images and store in collection (one-time processing)\n",
    "\n",
    "Documents are upserted by `image_id`, so re-running the cell overwrites existing documents instead of duplicating them."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "start = 0\n",
    "stop = len(dataset)\n",
    "step = 1\n",
    "\n",
    "for idx in range(start, stop, step):\n",
    "    img, label = dataset[idx]\n",
    "\n",
    "    img_shape = np.array(img).shape\n",
    "\n",
    "    if len(img_shape) >= 3 and img_shape[2] >= 3:\n",
    "        cm_fd = CM_transform(img).tolist()\n",
    "        img_channels = 3\n",
    "    else:\n",
    "        # no color moments for grayscale images\n",
    "        # TODO: if descriptors are wanted for these too, img.convert(\"RGB\")\n",
    "        # replicates the single channel into three\n",
    "        cm_fd = None\n",
    "        img_channels = 1\n",
    "\n",
    "    hog_fd = HOG_transform(img).tolist()\n",
    "    avgpool_1024_fd, layer3_1024_fd, fc_1000_fd = resnet_extractor(img, img_channels)\n",
    "\n",
    "    # Store to collection (upsert keeps the cell idempotent across re-runs)\n",
    "    fd_collection.replace_one(\n",
    "        {\"image_id\": idx},\n",
    "        {\n",
    "            \"image_id\": idx,\n",
    "            \"true_label\": label,\n",
    "            \"channels\": img_channels,\n",
    "            \"cm_fd\": cm_fd,\n",
    "            \"hog_fd\": hog_fd,\n",
    "            \"avgpool_fd\": avgpool_1024_fd,\n",
    "            \"layer3_fd\": layer3_1024_fd,\n",
    "            \"fc_fd\": fc_1000_fd,\n",
    "        },\n",
    "        upsert=True,\n",
    "    )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Clean-up\n",
    "\n",
    "Only needed for collections that were filled by an older version of this notebook, where accidental re-runs inserted the same `image_id` more than once."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove duplicates (accidental re-runs): keep the first document per image_id\n",
    "for image_id in fd_collection.distinct(\"image_id\"):\n",
    "    doc_ids = [\n",
    "        doc[\"_id\"] for doc in fd_collection.find({\"image_id\": image_id}, {\"_id\": 1})\n",
    "    ]\n",
    "    duplicate_ids = doc_ids[1:]\n",
    "    if duplicate_ids:\n",
    "        fd_collection.delete_many({\"_id\": {\"$in\": duplicate_ids}})\n",
    "\n",
    "# Number of documents left in the collection\n",
    "fd_collection.count_documents({})"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.5"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}