{
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"from utils import *\n",
"warnings.filterwarnings('ignore')\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"fd_collection = getCollection(\"team_5_mwdb_phase_2\", \"fd_collection\")\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Applying lda on the fc_fd space to get 10 latent semantics (showing only top 10 image-weight pairs for each latent semantic)...\n"
]
},
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"\u001b[1;32mc:\\Kaushik\\ASU\\CSE 515 - Multimedia and Web Databases\\Project\\Phase 2\\task_3.ipynb Cell 3\u001b[0m line \u001b[0;36m1\n\u001b[0;32m 7\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\u001b[39m\"\u001b[39m\u001b[39mk should be a positive integer\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m 9\u001b[0m selected_dim_reduction_method \u001b[39m=\u001b[39m \u001b[39mstr\u001b[39m(\n\u001b[0;32m 10\u001b[0m \u001b[39minput\u001b[39m(\n\u001b[0;32m 11\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mEnter dimensionality reduction method - one of \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m 12\u001b[0m \u001b[39m+\u001b[39m \u001b[39mstr\u001b[39m(\u001b[39mlist\u001b[39m(valid_dim_reduction_methods\u001b[39m.\u001b[39mkeys()))\n\u001b[0;32m 13\u001b[0m )\n\u001b[0;32m 14\u001b[0m )\n\u001b[1;32m---> 16\u001b[0m extract_latent_semantics(\n\u001b[0;32m 17\u001b[0m fd_collection,\n\u001b[0;32m 18\u001b[0m k,\n\u001b[0;32m 19\u001b[0m selected_feature_model,\n\u001b[0;32m 20\u001b[0m selected_dim_reduction_method,\n\u001b[0;32m 21\u001b[0m top_images\u001b[39m=\u001b[39;49m\u001b[39m10\u001b[39;49m,\n\u001b[0;32m 22\u001b[0m )\n",
"File \u001b[1;32mc:\\Kaushik\\ASU\\CSE 515 - Multimedia and Web Databases\\Project\\Phase 2\\utils.py:674\u001b[0m, in \u001b[0;36mextract_latent_semantics\u001b[1;34m(fd_collection, k, feature_model, dim_reduction_method, top_images)\u001b[0m\n\u001b[0;32m 669\u001b[0m \u001b[39m# unsupervised LDA to extract topics (Latent Dirichlet Allocation)\u001b[39;00m\n\u001b[0;32m 670\u001b[0m \u001b[39m# Note: LDA takes a bit of time\u001b[39;00m\n\u001b[0;32m 671\u001b[0m \u001b[39mcase\u001b[39;00m \u001b[39m3\u001b[39m:\n\u001b[0;32m 672\u001b[0m \u001b[39m# LDA requires non-negative input data\u001b[39;00m\n\u001b[0;32m 673\u001b[0m \u001b[39m# so shift the input by subtracting the smallest value\u001b[39;00m\n\u001b[1;32m--> 674\u001b[0m min_value \u001b[39m=\u001b[39m np\u001b[39m.\u001b[39mmin(feature_vectors)\n\u001b[0;32m 675\u001b[0m feature_vectors_shifted \u001b[39m=\u001b[39m feature_vectors \u001b[39m-\u001b[39m min_value\n\u001b[0;32m 677\u001b[0m model \u001b[39m=\u001b[39m LatentDirichletAllocation(n_components\u001b[39m=\u001b[39mk, learning_method\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39monline\u001b[39m\u001b[39m\"\u001b[39m, verbose\u001b[39m=\u001b[39m\u001b[39m4\u001b[39m)\n",
"File \u001b[1;32mc:\\Users\\rknar\\.pyenv\\pyenv-win\\versions\\3.10.5\\lib\\site-packages\\sklearn\\base.py:1151\u001b[0m, in \u001b[0;36m_fit_context..decorator..wrapper\u001b[1;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1144\u001b[0m estimator\u001b[39m.\u001b[39m_validate_params()\n\u001b[0;32m 1146\u001b[0m \u001b[39mwith\u001b[39;00m config_context(\n\u001b[0;32m 1147\u001b[0m skip_parameter_validation\u001b[39m=\u001b[39m(\n\u001b[0;32m 1148\u001b[0m prefer_skip_nested_validation \u001b[39mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 1149\u001b[0m )\n\u001b[0;32m 1150\u001b[0m ):\n\u001b[1;32m-> 1151\u001b[0m \u001b[39mreturn\u001b[39;00m fit_method(estimator, \u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n",
"File \u001b[1;32mc:\\Users\\rknar\\.pyenv\\pyenv-win\\versions\\3.10.5\\lib\\site-packages\\sklearn\\decomposition\\_lda.py:665\u001b[0m, in \u001b[0;36mLatentDirichletAllocation.fit\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 663\u001b[0m \u001b[39mif\u001b[39;00m learning_method \u001b[39m==\u001b[39m \u001b[39m\"\u001b[39m\u001b[39monline\u001b[39m\u001b[39m\"\u001b[39m:\n\u001b[0;32m 664\u001b[0m \u001b[39mfor\u001b[39;00m idx_slice \u001b[39min\u001b[39;00m gen_batches(n_samples, batch_size):\n\u001b[1;32m--> 665\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_em_step(\n\u001b[0;32m 666\u001b[0m X[idx_slice, :],\n\u001b[0;32m 667\u001b[0m total_samples\u001b[39m=\u001b[39;49mn_samples,\n\u001b[0;32m 668\u001b[0m batch_update\u001b[39m=\u001b[39;49m\u001b[39mFalse\u001b[39;49;00m,\n\u001b[0;32m 669\u001b[0m parallel\u001b[39m=\u001b[39;49mparallel,\n\u001b[0;32m 670\u001b[0m )\n\u001b[0;32m 671\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m 672\u001b[0m \u001b[39m# batch update\u001b[39;00m\n\u001b[0;32m 673\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_em_step(\n\u001b[0;32m 674\u001b[0m X, total_samples\u001b[39m=\u001b[39mn_samples, batch_update\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m, parallel\u001b[39m=\u001b[39mparallel\n\u001b[0;32m 675\u001b[0m )\n",
"File \u001b[1;32mc:\\Users\\rknar\\.pyenv\\pyenv-win\\versions\\3.10.5\\lib\\site-packages\\sklearn\\decomposition\\_lda.py:524\u001b[0m, in \u001b[0;36mLatentDirichletAllocation._em_step\u001b[1;34m(self, X, total_samples, batch_update, parallel)\u001b[0m\n\u001b[0;32m 497\u001b[0m \u001b[39m\u001b[39m\u001b[39m\"\"\"EM update for 1 iteration.\u001b[39;00m\n\u001b[0;32m 498\u001b[0m \n\u001b[0;32m 499\u001b[0m \u001b[39mupdate `_component` by batch VB or online VB.\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 520\u001b[0m \u001b[39m Unnormalized document topic distribution.\u001b[39;00m\n\u001b[0;32m 521\u001b[0m \u001b[39m\"\"\"\u001b[39;00m\n\u001b[0;32m 523\u001b[0m \u001b[39m# E-step\u001b[39;00m\n\u001b[1;32m--> 524\u001b[0m _, suff_stats \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_e_step(\n\u001b[0;32m 525\u001b[0m X, cal_sstats\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m, random_init\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m, parallel\u001b[39m=\u001b[39;49mparallel\n\u001b[0;32m 526\u001b[0m )\n\u001b[0;32m 528\u001b[0m \u001b[39m# M-step\u001b[39;00m\n\u001b[0;32m 529\u001b[0m \u001b[39mif\u001b[39;00m batch_update:\n",
"File \u001b[1;32mc:\\Users\\rknar\\.pyenv\\pyenv-win\\versions\\3.10.5\\lib\\site-packages\\sklearn\\decomposition\\_lda.py:467\u001b[0m, in \u001b[0;36mLatentDirichletAllocation._e_step\u001b[1;34m(self, X, cal_sstats, random_init, parallel)\u001b[0m\n\u001b[0;32m 465\u001b[0m \u001b[39mif\u001b[39;00m parallel \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m 466\u001b[0m parallel \u001b[39m=\u001b[39m Parallel(n_jobs\u001b[39m=\u001b[39mn_jobs, verbose\u001b[39m=\u001b[39m\u001b[39mmax\u001b[39m(\u001b[39m0\u001b[39m, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mverbose \u001b[39m-\u001b[39m \u001b[39m1\u001b[39m))\n\u001b[1;32m--> 467\u001b[0m results \u001b[39m=\u001b[39m parallel(\n\u001b[0;32m 468\u001b[0m delayed(_update_doc_distribution)(\n\u001b[0;32m 469\u001b[0m X[idx_slice, :],\n\u001b[0;32m 470\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mexp_dirichlet_component_,\n\u001b[0;32m 471\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mdoc_topic_prior_,\n\u001b[0;32m 472\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mmax_doc_update_iter,\n\u001b[0;32m 473\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mmean_change_tol,\n\u001b[0;32m 474\u001b[0m cal_sstats,\n\u001b[0;32m 475\u001b[0m random_state,\n\u001b[0;32m 476\u001b[0m )\n\u001b[0;32m 477\u001b[0m \u001b[39mfor\u001b[39;49;00m idx_slice \u001b[39min\u001b[39;49;00m gen_even_slices(X\u001b[39m.\u001b[39;49mshape[\u001b[39m0\u001b[39;49m], n_jobs)\n\u001b[0;32m 478\u001b[0m )\n\u001b[0;32m 480\u001b[0m \u001b[39m# merge result\u001b[39;00m\n\u001b[0;32m 481\u001b[0m doc_topics, sstats_list \u001b[39m=\u001b[39m \u001b[39mzip\u001b[39m(\u001b[39m*\u001b[39mresults)\n",
"File \u001b[1;32mc:\\Users\\rknar\\.pyenv\\pyenv-win\\versions\\3.10.5\\lib\\site-packages\\sklearn\\utils\\parallel.py:65\u001b[0m, in \u001b[0;36mParallel.__call__\u001b[1;34m(self, iterable)\u001b[0m\n\u001b[0;32m 60\u001b[0m config \u001b[39m=\u001b[39m get_config()\n\u001b[0;32m 61\u001b[0m iterable_with_config \u001b[39m=\u001b[39m (\n\u001b[0;32m 62\u001b[0m (_with_config(delayed_func, config), args, kwargs)\n\u001b[0;32m 63\u001b[0m \u001b[39mfor\u001b[39;00m delayed_func, args, kwargs \u001b[39min\u001b[39;00m iterable\n\u001b[0;32m 64\u001b[0m )\n\u001b[1;32m---> 65\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39msuper\u001b[39;49m()\u001b[39m.\u001b[39;49m\u001b[39m__call__\u001b[39;49m(iterable_with_config)\n",
"File \u001b[1;32mc:\\Users\\rknar\\.pyenv\\pyenv-win\\versions\\3.10.5\\lib\\site-packages\\joblib\\parallel.py:1863\u001b[0m, in \u001b[0;36mParallel.__call__\u001b[1;34m(self, iterable)\u001b[0m\n\u001b[0;32m 1861\u001b[0m output \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_get_sequential_output(iterable)\n\u001b[0;32m 1862\u001b[0m \u001b[39mnext\u001b[39m(output)\n\u001b[1;32m-> 1863\u001b[0m \u001b[39mreturn\u001b[39;00m output \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mreturn_generator \u001b[39melse\u001b[39;00m \u001b[39mlist\u001b[39;49m(output)\n\u001b[0;32m 1865\u001b[0m \u001b[39m# Let's create an ID that uniquely identifies the current call. If the\u001b[39;00m\n\u001b[0;32m 1866\u001b[0m \u001b[39m# call is interrupted early and that the same instance is immediately\u001b[39;00m\n\u001b[0;32m 1867\u001b[0m \u001b[39m# re-used, this id will be used to prevent workers that were\u001b[39;00m\n\u001b[0;32m 1868\u001b[0m \u001b[39m# concurrently finalizing a task from the previous call to run the\u001b[39;00m\n\u001b[0;32m 1869\u001b[0m \u001b[39m# callback.\u001b[39;00m\n\u001b[0;32m 1870\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_lock:\n",
"File \u001b[1;32mc:\\Users\\rknar\\.pyenv\\pyenv-win\\versions\\3.10.5\\lib\\site-packages\\joblib\\parallel.py:1792\u001b[0m, in \u001b[0;36mParallel._get_sequential_output\u001b[1;34m(self, iterable)\u001b[0m\n\u001b[0;32m 1790\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mn_dispatched_batches \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m \u001b[39m1\u001b[39m\n\u001b[0;32m 1791\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mn_dispatched_tasks \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m \u001b[39m1\u001b[39m\n\u001b[1;32m-> 1792\u001b[0m res \u001b[39m=\u001b[39m func(\u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n\u001b[0;32m 1793\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mn_completed_tasks \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m \u001b[39m1\u001b[39m\n\u001b[0;32m 1794\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mprint_progress()\n",
"File \u001b[1;32mc:\\Users\\rknar\\.pyenv\\pyenv-win\\versions\\3.10.5\\lib\\site-packages\\sklearn\\utils\\parallel.py:127\u001b[0m, in \u001b[0;36m_FuncWrapper.__call__\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 125\u001b[0m config \u001b[39m=\u001b[39m {}\n\u001b[0;32m 126\u001b[0m \u001b[39mwith\u001b[39;00m config_context(\u001b[39m*\u001b[39m\u001b[39m*\u001b[39mconfig):\n\u001b[1;32m--> 127\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mfunction(\u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n",
"File \u001b[1;32mc:\\Users\\rknar\\.pyenv\\pyenv-win\\versions\\3.10.5\\lib\\site-packages\\sklearn\\decomposition\\_lda.py:144\u001b[0m, in \u001b[0;36m_update_doc_distribution\u001b[1;34m(X, exp_topic_word_distr, doc_topic_prior, max_doc_update_iter, mean_change_tol, cal_sstats, random_state)\u001b[0m\n\u001b[0;32m 140\u001b[0m last_d \u001b[39m=\u001b[39m doc_topic_d\n\u001b[0;32m 142\u001b[0m \u001b[39m# The optimal phi_{dwk} is proportional to\u001b[39;00m\n\u001b[0;32m 143\u001b[0m \u001b[39m# exp(E[log(theta_{dk})]) * exp(E[log(beta_{dw})]).\u001b[39;00m\n\u001b[1;32m--> 144\u001b[0m norm_phi \u001b[39m=\u001b[39m np\u001b[39m.\u001b[39;49mdot(exp_doc_topic_d, exp_topic_word_d) \u001b[39m+\u001b[39m eps\n\u001b[0;32m 146\u001b[0m doc_topic_d \u001b[39m=\u001b[39m exp_doc_topic_d \u001b[39m*\u001b[39m np\u001b[39m.\u001b[39mdot(cnts \u001b[39m/\u001b[39m norm_phi, exp_topic_word_d\u001b[39m.\u001b[39mT)\n\u001b[0;32m 147\u001b[0m \u001b[39m# Note: adds doc_topic_prior to doc_topic_d, in-place.\u001b[39;00m\n",
"File \u001b[1;32m<__array_function__ internals>:180\u001b[0m, in \u001b[0;36mdot\u001b[1;34m(*args, **kwargs)\u001b[0m\n",
"\u001b[1;31mKeyboardInterrupt\u001b[0m: "
]
}
],
"source": [
"selected_feature_model = valid_feature_models[\n",
" str(input(\"Enter feature model - one of \" + str(list(valid_feature_models.keys()))))\n",
"]\n",
"\n",
"k = int(input(\"Enter value of k: \"))\n",
"if k < 1:\n",
" raise ValueError(\"k should be a positive integer\")\n",
"\n",
"selected_dim_reduction_method = str(\n",
" input(\n",
" \"Enter dimensionality reduction method - one of \"\n",
" + str(list(valid_dim_reduction_methods.keys()))\n",
" )\n",
")\n",
"\n",
"extract_latent_semantics(\n",
" fd_collection,\n",
" k,\n",
" selected_feature_model,\n",
" selected_dim_reduction_method,\n",
" top_images=10,\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}