ginihumer · Mar 28, 2024
diff --git a/‎notebooks/CLOOB/export_data_cloob_ablation.ipynb
+259 b/‎notebooks/CLOOB/export_data_cloob_ablation.ipynb
+259
@@ -0,0 +1,259 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Export Data for CLOOB Ablation Study\n",
+    "### This notebook exports the data for the CLOOB ablation analysis, which was carried out after the interactive article had been accepted at VISxAI."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Install Amumo into the environment of the running kernel.\n",
+    "# %pip targets the kernel's interpreter, while a shell `! pip` may pick up a different Python.\n",
+    "%pip install git+https://github.com/ginihumer/Amumo.git"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import amumo\n",
+    "from amumo import data as am_data\n",
+    "from amumo import utils as am_utils\n",
+    "from amumo import model as am_model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "def create_dir_if_not_exists(dir):\n",
+    "    \"\"\"Create the directory ``dir`` if it is missing and return ``dir`` unchanged.\n",
+    "\n",
+    "    Missing parent directories are created as well, and calling the function for\n",
+    "    a directory that already exists does nothing.\n",
+    "\n",
+    "    Args:\n",
+    "        dir: Path of the directory to create. The name shadows the builtin\n",
+    "            ``dir``; it is kept so that keyword callers keep working.\n",
+    "\n",
+    "    Returns:\n",
+    "        The ``dir`` argument, so the call can be used inline when building paths.\n",
+    "\n",
+    "    Raises:\n",
+    "        FileExistsError: If ``dir`` exists but is not a directory.\n",
+    "    \"\"\"\n",
+    "    # makedirs(..., exist_ok=True) avoids the check-then-create race of\n",
+    "    # os.path.exists() + os.mkdir() and also handles nested paths.\n",
+    "    os.makedirs(dir, exist_ok=True)\n",
+    "    return dir"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'./exported_data_checkpoints/'"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Root folder that receives every exported file. Keep the trailing slash:\n",
+    "# sub-folder names are appended to it by plain string concatenation.\n",
+    "export_directory = \"./exported_data_checkpoints/\"\n",
+    "create_dir_if_not_exists(export_directory)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Text-Image"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "\n",
+    "def export_data(dataset_name, images, prompts, models):\n",
+    "    \"\"\"Export similarity matrices, cluster meta information and 2D projections.\n",
+    "\n",
+    "    Every model in ``models`` is exported twice: with the embeddings returned by\n",
+    "    ``am_utils.get_embedding`` (no suffix) and with the embeddings returned by\n",
+    "    ``am_utils.get_closed_modality_gap`` (suffix ``_nogap``). Below the folder\n",
+    "    ``export_directory + dataset_name`` this writes, per model and variant:\n",
+    "\n",
+    "    * ``similarities/{model_name}{suffix}.csv``: the similarity matrix\n",
+    "    * ``similarities/{model_name}{suffix}_meta_info.json``: the keys ``gap_distance``,\n",
+    "      ``loss``, ``cluster_sort_idcs``, ``cluster_sort_idcs_reverse``,\n",
+    "      ``cluster_sizes`` and ``cluster_labels``\n",
+    "\n",
+    "    The PCA, UMAP and t-SNE coordinates of the stacked image and text embeddings\n",
+    "    are stored as columns ``{model_name}{suffix}_{method}_x`` / ``_y`` in\n",
+    "    ``projections.csv``. An existing ``projections.csv`` is loaded and extended,\n",
+    "    so columns written by earlier exports are kept.\n",
+    "\n",
+    "    Args:\n",
+    "        dataset_name: Folder name of the dataset; also forwarded to\n",
+    "            ``am_utils.get_embedding``.\n",
+    "        images: Images of the dataset, forwarded to ``am_utils.get_embedding``.\n",
+    "        prompts: Text prompts of the dataset, forwarded to\n",
+    "            ``am_utils.get_embedding`` and used to label the clusters.\n",
+    "        models: Iterable of model objects that expose ``model_name``.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    # create folder structure\n",
+    "    dataset_directory = create_dir_if_not_exists(export_directory + dataset_name)\n",
+    "    similarities_dir = create_dir_if_not_exists(dataset_directory + '/similarities')\n",
+    "    projections_path = dataset_directory + '/projections.csv'\n",
+    "\n",
+    "    # export projections and similarities\n",
+    "    import torch\n",
+    "    from sklearn.decomposition import PCA\n",
+    "    from openTSNE import TSNE\n",
+    "    from umap import UMAP\n",
+    "    import numpy as np\n",
+    "    import pandas as pd\n",
+    "    import json\n",
+    "\n",
+    "    # if there already exists a dataset with projections from prior exports, load it\n",
+    "    if os.path.exists(projections_path):\n",
+    "        # to_csv() at the end also writes the row index as first column. Reading it\n",
+    "        # back as the index keeps re-exports from piling up 'Unnamed: 0' columns.\n",
+    "        projections_df = pd.read_csv(projections_path, index_col=0)\n",
+    "    else:\n",
+    "        projections_df = pd.DataFrame({\n",
+    "            'emb_id': list(np.arange(0, len(images), 1)) + list(np.arange(0, len(prompts), 1)),\n",
+    "            'data_type': ['image'] * len(images) + ['text'] * len(prompts),\n",
+    "        })\n",
+    "\n",
+    "    # projection name -> class; the same table is used for every model and variant\n",
+    "    projection_methods = {\n",
+    "        'PCA': PCA,\n",
+    "        'UMAP': UMAP,\n",
+    "        'TSNE': TSNE\n",
+    "    }\n",
+    "\n",
+    "    for model in models:\n",
+    "        # compute embeddings\n",
+    "        image_embedding_gap, text_embedding_gap, logit_scale = am_utils.get_embedding(model, dataset_name, images, prompts)\n",
+    "        image_embedding_nogap, text_embedding_nogap = am_utils.get_closed_modality_gap(image_embedding_gap, text_embedding_gap)\n",
+    "\n",
+    "        variants = [\n",
+    "            (image_embedding_gap, text_embedding_gap, ''),\n",
+    "            (image_embedding_nogap, text_embedding_nogap, '_nogap'),\n",
+    "        ]\n",
+    "        for image_embedding, text_embedding, mode in variants:\n",
+    "            # common prefix of all file and column names of this model/variant\n",
+    "            export_name = '%s%s' % (model.model_name, mode)\n",
+    "\n",
+    "            # compute similarities\n",
+    "            similarity_image_text, similarity = am_utils.get_similarity(image_embedding, text_embedding)\n",
+    "            np.savetxt('%s/%s.csv' % (similarities_dir, export_name), similarity, delimiter=',')\n",
+    "\n",
+    "            # compute meta information and similarity clustering\n",
+    "            meta_info = {}\n",
+    "            meta_info['gap_distance'] = float(am_utils.get_modality_distance(image_embedding, text_embedding))\n",
+    "            meta_info['loss'] = float(am_utils.calculate_val_loss(image_embedding, text_embedding, logit_scale.exp()))\n",
+    "\n",
+    "            idcs, clusters, clusters_unsorted = am_utils.get_cluster_sorting(similarity_image_text)\n",
+    "            cluster_labels = []\n",
+    "            cluster_sizes = []\n",
+    "            for c in set(clusters):\n",
+    "                cluster_sizes.append(int(np.count_nonzero(clusters == c)))\n",
+    "                # the label is derived from the prompts whose unsorted cluster id is c\n",
+    "                cluster_labels.append(am_utils.get_textual_label_for_cluster(np.where(clusters_unsorted == c)[0], prompts))\n",
+    "\n",
+    "            meta_info['cluster_sort_idcs'] = idcs.tolist()\n",
+    "            # argsort of the sort order maps sorted positions back to original positions\n",
+    "            meta_info['cluster_sort_idcs_reverse'] = np.argsort(idcs).tolist()\n",
+    "            meta_info['cluster_sizes'] = cluster_sizes\n",
+    "            meta_info['cluster_labels'] = cluster_labels\n",
+    "\n",
+    "            with open('%s/%s_meta_info.json' % (similarities_dir, export_name), 'w') as file:\n",
+    "                json.dump(meta_info, file)\n",
+    "\n",
+    "            # compute projections; image rows first, then text rows (same order as emb_id/data_type)\n",
+    "            embedding = np.array(torch.concatenate([image_embedding, text_embedding]))\n",
+    "\n",
+    "            for method, projection_cls in projection_methods.items():\n",
+    "                if method == 'PCA':\n",
+    "                    proj = projection_cls(n_components=2)\n",
+    "                else:\n",
+    "                    proj = projection_cls(n_components=2, metric='cosine', random_state=31415)\n",
+    "\n",
+    "                if method == 'TSNE':\n",
+    "                    # openTSNE has no fit_transform(); fit() returns the embedding\n",
+    "                    low_dim_data = proj.fit(embedding)\n",
+    "                else:\n",
+    "                    low_dim_data = proj.fit_transform(embedding)\n",
+    "\n",
+    "                projections_df['%s_%s_x' % (export_name, method)] = low_dim_data[:, 0]\n",
+    "                projections_df['%s_%s_y' % (export_name, method)] = low_dim_data[:, 1]\n",
+    "\n",
+    "    projections_df.to_csv(projections_path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\Christina\\AppData\\Local\\Temp\\ipykernel_31664\\330881050.py:20: FutureWarning: The input object of type 'Image' is an array-like implementing one of the corresponding protocols (`__array__`, `__array_interface__` or `__array_struct__`); but not a sequence (or 0-D). In the future, this object will be coerced as if it was first converted using `np.array(obj)`. To retain the old behaviour, you have to either modify the type 'Image', or assign to an empty array created with `np.empty(correct_shape, dtype=object)`.\n",
+      "  self.all_images = np.array(all_images)\n",
+      "C:\\Users\\Christina\\AppData\\Local\\Temp\\ipykernel_31664\\330881050.py:20: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
+      "  self.all_images = np.array(all_images)\n"
+     ]
+    }
+   ],
+   "source": [
+    "\n",
+    "# reuse mscoco subset from previous analysis\n",
+    "from PIL import Image\n",
+    "import numpy as np\n",
+    "\n",
+    "class Custom_Dataset(am_data.DatasetInterface):\n",
+    "    \"\"\"Loads an already exported 100-image MSCOCO validation subset from disk.\n",
+    "\n",
+    "    The dataset folder must contain ``images/0.jpg`` to ``images/99.jpg`` and a\n",
+    "    ``prompts.txt`` file with one prompt per line.\n",
+    "    \"\"\"\n",
+    "    name = 'MSCOCO-Val'\n",
+    "\n",
+    "    def __init__(self, path, seed=54, batch_size=None):\n",
+    "        \"\"\"Read the images and prompts stored below ``path``.\n",
+    "\n",
+    "        Args:\n",
+    "            path: Dataset folder; a trailing slash is optional.\n",
+    "            seed: Forwarded to ``am_data.DatasetInterface.__init__``.\n",
+    "            batch_size: Forwarded to ``am_data.DatasetInterface.__init__``.\n",
+    "        \"\"\"\n",
+    "        super().__init__(path, seed, batch_size)\n",
+    "\n",
+    "        all_images = []\n",
+    "        for i in range(100):\n",
+    "            # os.path.join works whether or not path ends with a separator\n",
+    "            image_path = os.path.join(path, 'images', '%i.jpg' % i)\n",
+    "            with open(image_path, 'rb') as fopen:\n",
+    "                # convert() reads the pixel data while the file is still open\n",
+    "                all_images.append(Image.open(fopen).convert('RGB'))\n",
+    "\n",
+    "        # Keep the PIL images as elements of a 1-D object array. np.array(all_images)\n",
+    "        # would try to interpret each image as pixel data, which is deprecated for\n",
+    "        # images of different sizes and gives a 4-D pixel array for equal sizes.\n",
+    "        self.all_images = np.empty(len(all_images), dtype=object)\n",
+    "        for i, image in enumerate(all_images):\n",
+    "            self.all_images[i] = image\n",
+    "\n",
+    "        with open(os.path.join(path, 'prompts.txt'), 'r') as file:\n",
+    "            self.all_prompts = file.read().splitlines()"
+    "\n",
+    "# The subset lives in its own sub-folder of the export directory.\n",
+    "mscoco_val_dataset_name = 'MSCOCO-Val_size-100'\n",
+    "dataset_mscoco_val = Custom_Dataset('%s%s/' % (export_directory, mscoco_val_dataset_name))\n",
+    "mscoco_val_images, mscoco_val_prompts = dataset_mscoco_val.get_data()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "found cached embeddings for MSCOCO-Val_size-100_ImageBind_huge\n"
+     ]
+    }
+   ],
+   "source": [
+    "# TODO: export data for the models from the ablation study\n",
+    "# (currently only the ImageBind model is exported)\n",
+    "export_data(\n",
+    "    dataset_name=mscoco_val_dataset_name,\n",
+    "    images=mscoco_val_images,\n",
+    "    prompts=mscoco_val_prompts,\n",
+    "    models=[am_model.ImageBind_Model()],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "myenv3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.18"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}