{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "03c79b89",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "e517de18",
   "metadata": {},
   "outputs": [
    {
     "ename": "ModuleNotFoundError",
     "evalue": "No module named 'kagglehub'",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[1], line 2\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01msys\u001b[39;00m\n\u001b[1;32m----> 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mkagglehub\u001b[39;00m\n\u001b[0;32m      3\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mpandas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mpd\u001b[39;00m\n\u001b[0;32m      4\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mnumpy\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mnp\u001b[39;00m\n",
      "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'kagglehub'"
     ]
    }
   ],
   "source": [
    "import sys\n",
    "import kagglehub\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import os\n",
    "import pickle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "f7ac0941",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas.core.indexes\n",
    "sys.modules['pandas.indexes'] = pandas.core.indexes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "82aec67e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading dataset from Kaggle...\n"
     ]
    },
    {
     "ename": "NameError",
     "evalue": "name 'kagglehub' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[3], line 2\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDownloading dataset from Kaggle...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m----> 2\u001b[0m path \u001b[38;5;241m=\u001b[39m \u001b[43mkagglehub\u001b[49m\u001b[38;5;241m.\u001b[39mdataset_download(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mqingyi/wm811k-wafer-map\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m      3\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDataset downloaded to: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpath\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n",
      "\u001b[1;31mNameError\u001b[0m: name 'kagglehub' is not defined"
     ]
    }
   ],
   "source": [
    "print(\"Downloading dataset from Kaggle...\")\n",
    "path = kagglehub.dataset_download(\"qingyi/wm811k-wafer-map\")\n",
    "print(f\"Dataset downloaded to: {path}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "4dea4d86",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading dataset with latin1 encoding (this might take a minute)...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/_b/4pz7ygss6y7c564mcqz3grmw0000gn/T/ipykernel_3677/1757211552.py:6: VisibleDeprecationWarning: dtype(): align should be passed as Python or NumPy boolean but got `align=0`. Did you mean to pass a tuple to create a subarray type? (Deprecated NumPy 2.4)\n",
      "  df = pickle.load(f, encoding='latin1')\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Success! Total wafers in dataset: 811457\n"
     ]
    }
   ],
   "source": [
    "#Making the file path and loading the file in this venv\n",
    "file_path = os.path.join(path, 'LSWMD.pkl')\n",
    "\n",
    "print(\"Loading dataset with latin1 encoding (this might take a minute)...\")\n",
    "with open(file_path, 'rb') as f:\n",
    "    df = pickle.load(f, encoding='latin1')\n",
    "print(f\"Success! Total wafers in dataset: {len(df)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "bd97a8e5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Data cleaned! Now you can view it.\n"
     ]
    }
   ],
   "source": [
    "# 1. Clean the nested failure type column to create 'failure_class'\n",
    "df['failure_class'] = df['failureType'].apply(lambda x: x[0][0] if len(x) > 0 else 'None')\n",
    "\n",
    "# 2. Filter out the perfect wafers to create the 'defective_wafers' subset\n",
    "defective_wafers = df[(df['failure_class'] != 'None') & (df['failure_class'] != 'none')]\n",
    "\n",
    "print(\"Data cleaned! Now you can view it.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "780e3a68",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Top 5 rows of the dataset:\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>waferMap</th>\n",
       "      <th>dieSize</th>\n",
       "      <th>failure_class</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...</td>\n",
       "      <td>1683.0</td>\n",
       "      <td>none</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...</td>\n",
       "      <td>1683.0</td>\n",
       "      <td>none</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...</td>\n",
       "      <td>1683.0</td>\n",
       "      <td>none</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...</td>\n",
       "      <td>1683.0</td>\n",
       "      <td>none</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...</td>\n",
       "      <td>1683.0</td>\n",
       "      <td>none</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                            waferMap  dieSize failure_class\n",
       "0  [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...   1683.0          none\n",
       "1  [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...   1683.0          none\n",
       "2  [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...   1683.0          none\n",
       "3  [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...   1683.0          none\n",
       "4  [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...   1683.0          none"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Shape of the first defective wafer array: (45, 48)\n",
      "\n",
      "The raw 2D array data (Notice the 0s, 1s, and 2s):\n",
      "[[0 0 0 ... 0 0 0]\n",
      " [0 0 0 ... 0 0 0]\n",
      " [0 0 0 ... 0 0 0]\n",
      " ...\n",
      " [0 0 0 ... 0 0 0]\n",
      " [0 0 0 ... 0 0 0]\n",
      " [0 0 0 ... 0 0 0]]\n"
     ]
    }
   ],
   "source": [
    "#first look at the data\n",
    "print(\"Top 5 rows of the dataset:\")\n",
    "display(df[['waferMap', 'dieSize', 'failure_class']].head())\n",
    "\n",
    "# Look at exactly what the 2D array looks like for the first defective wafer\n",
    "first_defect_index = defective_wafers.index[0]\n",
    "first_defect_array = defective_wafers.loc[first_defect_index, 'waferMap']\n",
    "\n",
    "print(\"\\nShape of the first defective wafer array:\", first_defect_array.shape)\n",
    "print(\"\\nThe raw 2D array data (Notice the 0s, 1s, and 2s):\")\n",
    "print(first_defect_array)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "899f308a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The FULL raw 2D array data:\n",
      "[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 1 2 1 1 2 0 0 0 0 0 0 0 0 0\n",
      "  0 0 0 0 0 0 0 0 0 0 0 0]\n",
      " [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0\n",
      "  0 0 0 0 0 0 0 0 0 0 0 0]\n",
      " [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0\n",
      "  0 0 0 0 0 0 0 0 0 0 0 0]\n",
      " [0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1\n",
      "  0 0 0 0 0 0 0 0 0 0 0 0]\n",
      " [0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
      "  1 2 0 0 0 0 0 0 0 0 0 0]\n",
      " [0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
      "  1 1 1 0 0 0 0 0 0 0 0 0]\n",
      " [0 0 0 0 0 0 0 2 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2\n",
      "  1 1 1 2 0 0 0 0 0 0 0 0]\n",
      " [0 0 0 0 0 0 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
      "  1 1 1 1 1 0 0 0 0 0 0 0]\n",
      " [0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 2 1 1 1 1 1\n",
      "  1 2 1 1 1 1 0 0 0 0 0 0]\n",
      " [0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 2 1\n",
      "  1 1 1 1 1 1 1 0 0 0 0 0]\n",
      " [0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2\n",
      "  1 1 1 1 1 1 2 1 0 0 0 0]\n",
      " [0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1\n",
      "  1 1 1 1 2 2 1 1 0 0 0 0]\n",
      " [0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
      "  1 2 2 2 2 1 1 2 1 0 0 0]\n",
      " [0 0 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1\n",
      "  2 2 2 2 2 1 1 1 1 2 0 0]\n",
      " [0 0 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 2 1 1 2 1 1 1 1 1 1 1 1 1 2 2\n",
      "  2 2 2 2 2 2 1 1 1 2 0 0]\n",
      " [0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2\n",
      "  2 2 2 2 2 2 2 2 2 1 0 0]\n",
      " [0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2\n",
      "  2 2 2 2 1 1 1 1 1 1 1 0]\n",
      " [0 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2\n",
      "  2 2 2 2 2 2 2 2 1 1 1 0]\n",
      " [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1\n",
      "  2 2 2 2 2 2 2 1 2 1 1 0]\n",
      " [1 1 1 1 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1\n",
      "  1 1 1 2 1 1 1 1 1 1 1 2]\n",
      " [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
      "  2 1 1 1 1 1 1 1 1 1 1 2]\n",
      " [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 2 1\n",
      "  1 1 1 1 1 1 1 1 1 1 1 2]\n",
      " [2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1\n",
      "  1 1 1 2 1 1 1 2 1 1 2 2]\n",
      " [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
      "  1 1 1 1 1 1 1 2 1 1 1 2]\n",
      " [2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 1 1 1 1 1 1 1 1 1\n",
      "  1 1 1 2 1 1 1 1 1 1 1 2]\n",
      " [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
      "  1 1 2 1 2 1 1 1 1 1 1 2]\n",
      " [2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
      "  1 1 1 1 1 1 1 1 1 1 1 0]\n",
      " [2 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
      "  1 1 1 1 1 1 1 1 1 1 2 0]\n",
      " [0 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2\n",
      "  1 1 1 1 1 1 1 1 1 1 1 0]\n",
      " [0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1\n",
      "  1 1 1 1 1 1 1 1 1 1 1 0]\n",
      " [0 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
      "  1 1 1 1 1 1 1 1 1 1 0 0]\n",
      " [0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
      "  1 1 1 1 1 1 1 1 1 1 0 0]\n",
      " [0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
      "  1 1 1 1 1 1 1 1 1 0 0 0]\n",
      " [0 0 0 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
      "  1 1 1 1 1 1 1 1 1 0 0 0]\n",
      " [0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
      "  1 1 1 1 1 1 1 1 0 0 0 0]\n",
      " [0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
      "  1 1 1 1 1 1 1 0 0 0 0 0]\n",
      " [0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2\n",
      "  1 1 1 1 1 1 1 0 0 0 0 0]\n",
      " [0 0 0 0 0 0 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
      "  1 1 1 2 1 1 0 0 0 0 0 0]\n",
      " [0 0 0 0 0 0 0 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1\n",
      "  1 1 2 1 1 0 0 0 0 0 0 0]\n",
      " [0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
      "  1 1 1 1 0 0 0 0 0 0 0 0]\n",
      " [0 0 0 0 0 0 0 0 0 2 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 2 1 1 1 1\n",
      "  2 1 0 0 0 0 0 0 0 0 0 0]\n",
      " [0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1\n",
      "  2 0 0 0 0 0 0 0 0 0 0 0]\n",
      " [0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0\n",
      "  0 0 0 0 0 0 0 0 0 0 0 0]\n",
      " [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0\n",
      "  0 0 0 0 0 0 0 0 0 0 0 0]\n",
      " [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 2 0 0 0 0 0 0\n",
      "  0 0 0 0 0 0 0 0 0 0 0 0]]\n"
     ]
    }
   ],
   "source": [
    "# Force NumPy to print the entire array without truncation\n",
    "np.set_printoptions(threshold=10000)\n",
    "\n",
    "print(\"The FULL raw 2D array data:\")\n",
    "print(first_defect_array)\n",
    "\n",
    "# Reset it back to default right after so future arrays don't break your terminal\n",
    "np.set_printoptions(threshold=1000)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "wafer_gpu",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.25"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}