{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "03c79b89",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "e517de18",
   "metadata": {},
   "outputs": [
    {
     "ename": "ModuleNotFoundError",
     "evalue": "No module named 'kagglehub'",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[1], line 2\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01msys\u001b[39;00m\n\u001b[1;32m----> 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mkagglehub\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mpandas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mpd\u001b[39;00m\n\u001b[0;32m 4\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mnumpy\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mnp\u001b[39;00m\n",
      "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'kagglehub'"
     ]
    }
   ],
   "source": [
    "import sys\n",
    "import kagglehub\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import os\n",
    "import pickle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "f7ac0941",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compatibility shim: register pandas.core.indexes under the legacy module\n",
    "# name 'pandas.indexes', so any import of that old path resolves to it.\n",
    "# NOTE(review): presumably needed to unpickle a DataFrame saved by an older\n",
    "# pandas release - confirm against the dataset file.\n",
    "import pandas.core.indexes\n",
    "\n",
    "sys.modules['pandas.indexes'] = sys.modules['pandas.core.indexes']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "82aec67e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading dataset from Kaggle...\n"
     ]
    },
    {
     "ename": "NameError",
     "evalue": "name 'kagglehub' 
is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[3], line 2\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDownloading dataset from Kaggle...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m----> 2\u001b[0m path \u001b[38;5;241m=\u001b[39m \u001b[43mkagglehub\u001b[49m\u001b[38;5;241m.\u001b[39mdataset_download(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mqingyi/wm811k-wafer-map\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDataset downloaded to: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpath\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n",
      "\u001b[1;31mNameError\u001b[0m: name 'kagglehub' is not defined"
     ]
    }
   ],
   "source": [
    "# Handle of the Kaggle dataset to fetch.\n",
    "dataset_handle = \"qingyi/wm811k-wafer-map\"\n",
    "\n",
    "print(\"Downloading dataset from Kaggle...\")\n",
    "# The value returned by kagglehub is kept in `path` and printed below.\n",
    "path = kagglehub.dataset_download(dataset_handle)\n",
    "print(f\"Dataset downloaded to: {path}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "4dea4d86",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading dataset with latin1 encoding (this might take a minute)...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/_b/4pz7ygss6y7c564mcqz3grmw0000gn/T/ipykernel_3677/1757211552.py:6: VisibleDeprecationWarning: dtype(): align should be passed as Python or NumPy boolean but got `align=0`. Did you mean to pass a tuple to create a subarray type? (Deprecated NumPy 2.4)\n",
      " df = pickle.load(f, encoding='latin1')\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Success! 
Total wafers in dataset: 811457\n"
     ]
    }
   ],
   "source": [
    "# Build the path to the pickled dataset inside the downloaded folder.\n",
    "file_path = os.path.join(path, 'LSWMD.pkl')\n",
    "\n",
    "print(\"Loading dataset with latin1 encoding (this might take a minute)...\")\n",
    "# SECURITY: pickle.load can execute arbitrary code while deserializing.\n",
    "# Only run this cell on a dataset file obtained from a source you trust.\n",
    "# encoding='latin1' tells pickle how to decode 8-bit strings written by\n",
    "# Python 2 (NOTE(review): presumably how this file was produced - confirm).\n",
    "with open(file_path, 'rb') as f:\n",
    "    df = pickle.load(f, encoding='latin1')\n",
    "print(f\"Success! Total wafers in dataset: {len(df)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "bd97a8e5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Data cleaned! Now you can view it.\n"
     ]
    }
   ],
   "source": [
    "# 1. Flatten the nested 'failureType' entries into a plain 'failure_class' column.\n",
    "#    Empty entries get the placeholder 'None' (capital N); non-empty entries\n",
    "#    contribute their first nested element, x[0][0].\n",
    "#    NOTE(review): presumably empty entries are unlabeled wafers, while the\n",
    "#    lowercase label 'none' marks wafers without a defect pattern - confirm.\n",
    "df['failure_class'] = df['failureType'].apply(lambda x: x[0][0] if len(x) > 0 else 'None')\n",
    "\n",
    "# 2. Build 'defective_wafers': drop both the 'None' placeholder rows and the\n",
    "#    rows labeled 'none'. The two values differ only by case, so they are\n",
    "#    listed together here to keep the exclusion rule in one place.\n",
    "#    .copy() makes the subset an independent DataFrame, so later assignments\n",
    "#    to it neither trigger SettingWithCopyWarning nor get silently lost.\n",
    "non_defect_labels = ['None', 'none']\n",
    "defective_wafers = df[~df['failure_class'].isin(non_defect_labels)].copy()\n",
    "\n",
    "print(\"Data cleaned! Now you can view it.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "780e3a68",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Top 5 rows of the dataset:\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "
| \n", " | waferMap | \n", "dieSize | \n", "failure_class | \n", "
|---|---|---|---|
| 0 | \n", "[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... | \n", "1683.0 | \n", "none | \n", "
| 1 | \n", "[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... | \n", "1683.0 | \n", "none | \n", "
| 2 | \n", "[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... | \n", "1683.0 | \n", "none | \n", "
| 3 | \n", "[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... | \n", "1683.0 | \n", "none | \n", "
| 4 | \n", "[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... | \n", "1683.0 | \n", "none | \n", "