# Compatibility alias: expose the current ``pandas.core.indexes`` package under
# the legacy module name ``pandas.indexes`` in the interpreter's module table.
# NOTE(review): presumably needed because the dataset pickle loaded later in
# this notebook references the old ``pandas.indexes`` path - confirm by
# removing this cell and re-running the load.
import pandas.core.indexes

sys.modules.update({'pandas.indexes': pandas.core.indexes})
# Download the dataset through kagglehub and keep the returned location in
# `path`, which the loading cell joins with the pickle file name.
# Requires the `kagglehub` package to be importable in the active kernel.
KAGGLE_DATASET = "qingyi/wm811k-wafer-map"

print("Downloading dataset from Kaggle...")
path = kagglehub.dataset_download(KAGGLE_DATASET)
print(f"Dataset downloaded to: {path}")
# --- Load -------------------------------------------------------------------
# Locate the pickled dataset inside the directory returned by the download cell.
file_path = os.path.join(path, 'LSWMD.pkl')

print("Loading dataset with latin1 encoding (this might take a minute)...")
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only run this against a download you trust.
with open(file_path, 'rb') as pickle_file:
    df = pickle.load(pickle_file, encoding='latin1')
print(f"Success! Total wafers in dataset: {len(df)}")


# --- Clean ------------------------------------------------------------------
def _first_failure_label(failure_type):
    """Return the innermost first element of a nested ``failureType`` entry.

    Entries with length 0 carry no label and are mapped to the string 'None'.
    """
    if len(failure_type) > 0:
        return failure_type[0][0]
    return 'None'


# 1. Flatten the nested failure type column into a plain 'failure_class' column.
df['failure_class'] = df['failureType'].apply(_first_failure_label)

# 2. Keep only rows whose class is neither 'None' (no label present, assigned
#    above) nor 'none' (a label value that occurs in the data - presumably the
#    dataset's "no defect pattern" class; verify against the dataset docs).
failure_class = df['failure_class']
is_defective = failure_class.ne('None') & failure_class.ne('none')
defective_wafers = df[is_defective]

print("Data cleaned! Now you can view it.")
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
waferMapdieSizefailure_class
0[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...1683.0none
1[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...1683.0none
2[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...1683.0none
3[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...1683.0none
4[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...1683.0none
\n", "
# First look at the data: preview a few columns of the full frame, then
# inspect the wafer map stored in the first row of the defective subset.
preview_columns = ['waferMap', 'dieSize', 'failure_class']

print("Top 5 rows of the dataset:")
display(df[preview_columns].head())

# Take the index label of the first defective row and use it to fetch that
# row's 'waferMap' value; `first_defect_array` is reused by the next cell.
first_defect_index = defective_wafers.index[0]
first_defect_array = defective_wafers.loc[first_defect_index, 'waferMap']

print("\nShape of the first defective wafer array:", first_defect_array.shape)
print("\nThe raw 2D array data (Notice the 0s, 1s, and 2s):")
print(first_defect_array)
0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", " 1 2 0 0 0 0 0 0 0 0 0 0]\n", " [0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", " 1 1 1 0 0 0 0 0 0 0 0 0]\n", " [0 0 0 0 0 0 0 2 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2\n", " 1 1 1 2 0 0 0 0 0 0 0 0]\n", " [0 0 0 0 0 0 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", " 1 1 1 1 1 0 0 0 0 0 0 0]\n", " [0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 2 1 1 1 1 1\n", " 1 2 1 1 1 1 0 0 0 0 0 0]\n", " [0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 2 1\n", " 1 1 1 1 1 1 1 0 0 0 0 0]\n", " [0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2\n", " 1 1 1 1 1 1 2 1 0 0 0 0]\n", " [0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1\n", " 1 1 1 1 2 2 1 1 0 0 0 0]\n", " [0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", " 1 2 2 2 2 1 1 2 1 0 0 0]\n", " [0 0 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1\n", " 2 2 2 2 2 1 1 1 1 2 0 0]\n", " [0 0 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 2 1 1 2 1 1 1 1 1 1 1 1 1 2 2\n", " 2 2 2 2 2 2 1 1 1 2 0 0]\n", " [0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2\n", " 2 2 2 2 2 2 2 2 2 1 0 0]\n", " [0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2\n", " 2 2 2 2 1 1 1 1 1 1 1 0]\n", " [0 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2\n", " 2 2 2 2 2 2 2 2 1 1 1 0]\n", " [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1\n", " 2 2 2 2 2 2 2 1 2 1 1 0]\n", " [1 1 1 1 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1\n", " 1 1 1 2 1 1 1 1 1 1 1 2]\n", " [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", " 2 1 1 1 1 1 1 1 1 1 1 2]\n", " [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 2 1\n", " 1 1 1 1 1 1 1 1 1 1 1 2]\n", " [2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 
1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1\n", " 1 1 1 2 1 1 1 2 1 1 2 2]\n", " [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", " 1 1 1 1 1 1 1 2 1 1 1 2]\n", " [2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 1 1 1 1 1 1 1 1 1\n", " 1 1 1 2 1 1 1 1 1 1 1 2]\n", " [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", " 1 1 2 1 2 1 1 1 1 1 1 2]\n", " [2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", " 1 1 1 1 1 1 1 1 1 1 1 0]\n", " [2 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", " 1 1 1 1 1 1 1 1 1 1 2 0]\n", " [0 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2\n", " 1 1 1 1 1 1 1 1 1 1 1 0]\n", " [0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1\n", " 1 1 1 1 1 1 1 1 1 1 1 0]\n", " [0 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", " 1 1 1 1 1 1 1 1 1 1 0 0]\n", " [0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", " 1 1 1 1 1 1 1 1 1 1 0 0]\n", " [0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", " 1 1 1 1 1 1 1 1 1 0 0 0]\n", " [0 0 0 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", " 1 1 1 1 1 1 1 1 1 0 0 0]\n", " [0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", " 1 1 1 1 1 1 1 1 0 0 0 0]\n", " [0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", " 1 1 1 1 1 1 1 0 0 0 0 0]\n", " [0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2\n", " 1 1 1 1 1 1 1 0 0 0 0 0]\n", " [0 0 0 0 0 0 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", " 1 1 1 2 1 1 0 0 0 0 0 0]\n", " [0 0 0 0 0 0 0 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1\n", " 1 1 2 1 1 0 0 0 0 0 0 0]\n", " [0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", " 1 1 1 1 0 0 0 0 0 0 0 0]\n", " [0 0 0 0 0 0 0 0 0 2 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 
# Print the entire wafer map without NumPy's "..." summarization.
#
# The print options are changed only inside the `with` block:
#   * the previous settings (whatever they were) are restored on exit, even if
#     printing raises, instead of being overwritten with a hard-coded 1000;
#   * threshold=sys.maxsize disables summarization for arrays of any size,
#     where a fixed threshold of 10000 would still truncate larger arrays.
print("The FULL raw 2D array data:")
with np.printoptions(threshold=sys.maxsize):
    print(first_defect_array)