{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "23987af9",
   "metadata": {},
   "source": [
    "# 📥 Security Dataset Download & Preparation\n",
    "\n",
    "This notebook downloads and prepares all security datasets for training.\n",
    "Run this notebook **once** before training any models.\n",
    "\n",
    "## Datasets Included:\n",
    "- **Phishing Detection**: Malicious URLs, phishing websites\n",
    "- **Malware Analysis**: PE features, Android malware\n",
    "- **Network Intrusion**: NSL-KDD, CICIDS, UNSW-NB15\n",
    "- **Web Attacks**: XSS, SQL injection, CSRF\n",
    "- **Threat Intelligence**: Malicious IPs, botnet C2\n",
    "- **DNS Security**: DGA detection\n",
    "- **Spam Detection**: Email classification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "b888df31",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Note: you may need to restart the kernel to use updated packages.\n",
      "✅ Dependencies installed\n"
     ]
    }
   ],
   "source": [
    "# Install required packages using pip magic (ensures correct kernel environment)\n",
    "%pip install -q pandas numpy certifi nest_asyncio tqdm\n",
    "\n",
    "print('✅ Dependencies installed')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "53a35426",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ Dataset manager imported\n"
     ]
    }
   ],
   "source": [
    "import asyncio\n",
    "import sys\n",
    "from pathlib import Path\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "# Add the project's services directory to the import path.\n",
    "sys.path.insert(0, str(Path.cwd().parent / 'app' / 'services'))\n",
    "\n",
    "# Project-local module; imported after the sys.path tweak above.\n",
    "from web_security_datasets import WebSecurityDatasetManager\n",
    "\n",
    "# For Jupyter async support: the asyncio.run() calls in later cells may rely\n",
    "# on nest_asyncio being applied. This stays best-effort, but a failure is\n",
    "# reported instead of being silently swallowed by a bare except.\n",
    "try:\n",
    "    import nest_asyncio\n",
    "    nest_asyncio.apply()\n",
    "except Exception as exc:\n",
    "    print(f'⚠️ nest_asyncio could not be applied ({exc!r}); '\n",
    "          'asyncio.run() may fail inside Jupyter.')\n",
    "\n",
    "print('✅ Dataset manager imported')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "e831a641",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "📊 Available Security Datasets:\n",
      " Categories: ['phishing', 'web_attack', 'cryptomining', 'dns', 'malware', 'threat_intel', 'logs', 'spam', 'ssl', 'intrusion']\n",
      " Total datasets: 18\n",
      " Estimated samples: 1,072,129\n",
      "\n",
      "📋 Dataset List:\n",
      " • url_phishing_kaggle: Malicious vs Benign URLs (Kaggle) [phishing]\n",
      " • phishing_websites_uci: UCI Phishing Websites Dataset [phishing]\n",
      " • malware_pe_features: PE Header Malware Features [malware]\n",
      " • android_malware_drebin: Android Malware (Drebin-style Features) [malware]\n",
      " • cicids2017_ddos: CICIDS 2017 DDoS Detection [intrusion]\n",
      " • nsl_kdd_train: NSL-KDD Network Intrusion [intrusion]\n",
      " • unsw_nb15: UNSW-NB15 Network Dataset [intrusion]\n",
      " • ipsum_malicious_ips: IPsum Malicious IPs [threat_intel]\n",
      " • feodotracker_botnet: Feodo Tracker Botnet C2 [threat_intel]\n",
      " • urlhaus_malicious: URLhaus Malicious URLs [threat_intel]\n",
      " • spambase_uci: UCI Spambase [spam]\n",
      " • xss_payloads: XSS Attack Payloads [web_attack]\n",
      " • sql_injection_payloads: SQL Injection Payloads [web_attack]\n",
      " • http_csic_requests: HTTP CSIC 2010 Dataset [web_attack]\n",
      " • cryptomining_scripts: Cryptomining Script Detection [cryptomining]\n",
      " • dga_domains: DGA Domain Detection [dns]\n",
      " • ssl_certificates: SSL Certificate Analysis [ssl]\n",
      " • system_logs_hdfs: HDFS System Logs [logs]\n"
     ]
    }
   ],
   "source": [
    "# Initialize dataset manager\n",
    "DATASET_DIR = Path.cwd().parent / 'datasets' / 'web_security'\n",
    "manager = WebSecurityDatasetManager(str(DATASET_DIR))\n",
    "\n",
    "# Show available datasets. The result is named `catalog` so it cannot be\n",
    "# confused with (or clobbered by) per-dataset loop variables in later cells.\n",
    "catalog = manager.get_available_datasets()\n",
    "print('📊 Available Security Datasets:')\n",
    "print(f' Categories: {catalog[\"categories\"]}')\n",
    "print(f' Total datasets: {len(catalog[\"configured\"])}')\n",
    "print(f' Estimated samples: {catalog[\"total_configured_samples\"]:,}')\n",
    "\n",
    "print('\\n📋 Dataset List:')\n",
    "for ds_id, ds_info in manager.SECURITY_DATASETS.items():\n",
    "    print(f' • {ds_id}: {ds_info[\"name\"]} [{ds_info[\"category\"]}]')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "17800fb7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "📥 Downloading all security datasets...\n",
      " This may take 5-10 minutes on first run.\n",
      "\n",
      "\n",
      "📊 Download Results:\n",
      " ✅ Successful: 0\n",
      " ⏭️ Skipped: 18\n",
      " ❌ Failed: 0\n",
      "\n",
      " 📈 Total samples available: 1,072,129\n"
     ]
    }
   ],
   "source": [
    "# Download all datasets\n",
    "print('📥 Downloading all security datasets...')\n",
    "print(' This may take 5-10 minutes on first run.\\n')\n",
    "\n",
    "\n",
    "async def download_all():\n",
    "    \"\"\"Run the manager's bulk download with force=False and return its result.\"\"\"\n",
    "    return await manager.download_all_datasets(force=False)\n",
    "\n",
    "\n",
    "results = asyncio.run(download_all())\n",
    "\n",
    "print('\\n📊 Download Results:')\n",
    "print(f' ✅ Successful: {len(results[\"successful\"])}')\n",
    "print(f' ⏭️ Skipped: {len(results[\"skipped\"])}')\n",
    "print(f' ❌ Failed: {len(results[\"failed\"])}')\n",
    "# List each failed entry so a failure is actionable, not just counted.\n",
    "for failure in results['failed']:\n",
    "    print(f'    - {failure}')\n",
    "print(f'\\n 📈 Total samples available: {results[\"total_samples\"]:,}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "218aa401",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "📁 Downloaded Datasets Summary:\n",
      "\n",
      " Dataset Category Samples Synthetic\n",
      " url_phishing_kaggle phishing 450000 No\n",
      " phishing_websites_uci phishing 11055 No\n",
      " malware_pe_features malware 4500 No\n",
      "android_malware_drebin malware 15000 No\n",
      " cicids2017_ddos intrusion 128000 No\n",
      " nsl_kdd_train intrusion 125973 No\n",
      " unsw_nb15 intrusion 175000 No\n",
      " ipsum_malicious_ips threat_intel 25000 No\n",
      " feodotracker_botnet threat_intel 5000 No\n",
      " urlhaus_malicious threat_intel 10000 No\n",
      " spambase_uci spam 4601 No\n",
      " xss_payloads web_attack 5000 No\n",
      "sql_injection_payloads web_attack 3000 No\n",
      " http_csic_requests web_attack 36000 No\n",
      " cryptomining_scripts cryptomining 5000 No\n",
      " dga_domains dns 50000 No\n",
      " ssl_certificates ssl 8000 No\n",
      " system_logs_hdfs logs 11000 No\n",
      "\n",
      "📊 Total: 1,072,129 samples across 18 datasets\n"
     ]
    }
   ],
   "source": [
    "# Verify downloaded datasets\n",
    "print('\\n📁 Downloaded Datasets Summary:\\n')\n",
    "\n",
    "SUMMARY_COLUMNS = ['Dataset', 'Category', 'Samples', 'Synthetic']\n",
    "\n",
    "summary_rows = [\n",
    "    {\n",
    "        'Dataset': ds_id,\n",
    "        'Category': ds_meta.get('category', 'unknown'),\n",
    "        # Use 'actual_samples' when present, else 'samples', else 0.\n",
    "        'Samples': ds_meta.get('actual_samples', ds_meta.get('samples', 0)),\n",
    "        'Synthetic': 'Yes' if ds_meta.get('synthetic') else 'No',\n",
    "    }\n",
    "    for ds_id, ds_meta in manager.downloaded_datasets.items()\n",
    "]\n",
    "\n",
    "# Explicit columns keep the 'Samples' lookup below from raising KeyError\n",
    "# when no datasets have been downloaded.\n",
    "summary_df = pd.DataFrame(summary_rows, columns=SUMMARY_COLUMNS)\n",
    "print(summary_df.to_string(index=False))\n",
    "\n",
    "print(f'\\n📊 Total: {summary_df[\"Samples\"].sum():,} samples across {len(summary_df)} datasets')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9ccb78f2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Quick data quality check\n",
    "print('🔍 Data Quality Check:\\n')\n",
    "\n",
    "QUALITY_CHECK_LIMIT = 5  # spot-check only the first few datasets\n",
    "\n",
    "\n",
    "async def check_quality():\n",
    "    \"\"\"Load the first QUALITY_CHECK_LIMIT downloaded datasets and print basic stats.\"\"\"\n",
    "    for ds_id in list(manager.downloaded_datasets.keys())[:QUALITY_CHECK_LIMIT]:\n",
    "        df = await manager.load_dataset(ds_id)\n",
    "        if df is None:\n",
    "            # Report load failures instead of skipping them silently.\n",
    "            print(f' {ds_id}: ⚠️ could not be loaded')\n",
    "            continue\n",
    "        # df.size is rows * columns; guard against a zero-size frame.\n",
    "        null_pct = (df.isnull().sum().sum() / df.size) * 100 if df.size else 0.0\n",
    "        print(f' {ds_id}:')\n",
    "        print(f' Shape: {df.shape}')\n",
    "        print(f' Null %: {null_pct:.2f}%')\n",
    "        print(f' Numeric cols: {len(df.select_dtypes(include=[\"number\"]).columns)}')\n",
    "\n",
    "\n",
    "asyncio.run(check_quality())\n",
    "\n",
    "print('\\n✅ Dataset preparation complete!')\n",
    "print('\\n🚀 You can now run the training notebooks.')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.15.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}