TensorCat committed on Apr 14

Commit

052d67e

verified ·

1 Parent(s): 5bfa0f8

Upload 30 files

Browse files

Files changed (31) hide show

.gitattributes +3 -0
UM_Handbook/(Demo Pilot)FineTune_QWEN3_UM_Handbook_en.ipynb +1531 -0
UM_Handbook/Dataset/Manual_Index/UM_Manual_Core_Question_Index.json +0 -0
UM_Handbook/Dataset/SFT Dataset Core Questions Draft/SFT_QA_Draft_Build_Report.json +30 -0
UM_Handbook/Dataset/SFT Dataset Core Questions Draft/SFT_QA_Metadata_Draft.jsonl +0 -0
UM_Handbook/Dataset/SFT Dataset Core Questions Draft/SFT_QA_Metadata_Draft_pretty.json +0 -0
UM_Handbook/Dataset/SFT Dataset Core Questions Draft/SFT_QA_Training_Draft.jsonl +0 -0
UM_Handbook/Dataset/SFT Dataset Core Questions Draft/SFT_QA_Training_Draft_pretty.json +0 -0
UM_Handbook/Dataset/SFT_Dataset/SFT_QA_Metadata.jsonl +0 -0
UM_Handbook/Dataset/SFT_Dataset/SFT_QA_Metadata_pretty.json +0 -0
UM_Handbook/Dataset/SFT_Dataset/SFT_QA_Training_Ready.jsonl +0 -0
UM_Handbook/Dataset/SFT_Dataset/SFT_QA_Training_Ready_pretty.json +0 -0
UM_Handbook/Dataset/Source Chunk Dataset/Source_Chunks_Dataset.jsonl +0 -0
UM_Handbook/Dataset/Source Chunk Dataset/Source_Chunks_Dataset_pretty.json +0 -0
UM_Handbook/Dataset/Source Chunk Dataset/Source_Chunks_Dataset_report.json +14 -0
UM_Handbook/Dataset/markdown/complete_handbook_structured.md +0 -0
UM_Handbook/Dataset/markdown/general_handbook_structured.md +488 -0
UM_Handbook/Dataset/pdf/Complete Handbook.pdf +3 -0
UM_Handbook/Dataset/pdf/General Handbook.pdf +3 -0
UM_Handbook/Dataset/reports/um_handbook_markdown_report.json +2771 -0
UM_Handbook/FineTune_QWEN3_UM_Handbook_optimized_1.ipynb +0 -0
UM_Handbook/UM_Handbook_Markdown_Preprocess.py +286 -0
UM_Handbook/UM_SFT_QA_Dataset_Builder_from_Index.py +620 -0
UM_Handbook/UM_Source_Chunk_Dataset_Builder.py +265 -0
UM_Handbook/assets/TensorCat.png +0 -0
UM_Handbook/outputs/qwen3_um_handbook_optimized_1/merged_model/chat_template.jinja +89 -0
UM_Handbook/outputs/qwen3_um_handbook_optimized_1/merged_model/config.json +71 -0
UM_Handbook/outputs/qwen3_um_handbook_optimized_1/merged_model/generation_config.json +13 -0
UM_Handbook/outputs/qwen3_um_handbook_optimized_1/merged_model/tokenizer.json +3 -0
UM_Handbook/outputs/qwen3_um_handbook_optimized_1/merged_model/tokenizer_config.json +29 -0
UM_Handbook/um_handbook_config.py +230 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+UM_Handbook/Dataset/pdf/Complete[[:space:]]Handbook.pdf filter=lfs diff=lfs merge=lfs -text
+UM_Handbook/Dataset/pdf/General[[:space:]]Handbook.pdf filter=lfs diff=lfs merge=lfs -text
+UM_Handbook/outputs/qwen3_um_handbook_optimized_1/merged_model/tokenizer.json filter=lfs diff=lfs merge=lfs -text

UM_Handbook/(Demo Pilot)FineTune_QWEN3_UM_Handbook_en.ipynb ADDED Viewed

	@@ -0,0 +1,1531 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "ac09de66",
+   "metadata": {},
+   "source": [
+    "# Qwen3-8B UM Handbook LoRA / QLoRA Fine-tuning\n",
+    "\n",
+    "This notebook keeps the training logic from `finetune_qwen3_um_handbook_v3.py` and organizes it into separate notebook sections for DICC.\n",
+    "\n",
+    "## Workflow\n",
+    "1. Check the environment and available devices\n",
+    "2. Read and validate `SFT_QA_Training_Ready.jsonl`\n",
+    "3. Convert the QA data into prompt-completion format\n",
+    "4. Split the data into training and validation sets\n",
+    "5. Download the Qwen3-8B base model into a local directory\n",
+    "6. Select the backend automatically: **CUDA > MPS > CPU**\n",
+    "7. Use **4-bit QLoRA** on CUDA and standard LoRA on MPS / CPU\n",
+    "8. Train with `TRL SFTTrainer`\n",
+    "9. Evaluate with both loss-based and generation-based metrics\n",
+    "10. Save the LoRA adapter, merged model, `.pt` file, metrics, and predictions\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "71f10012",
+   "metadata": {},
+   "source": [
+    "## Part 1 - Install dependencies\n",
+    "\n",
+    "Run this cell only if the current DICC kernel does not already have the required packages.\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d091473c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %pip install -U torch transformers accelerate datasets trl peft bitsandbytes sentencepiece evaluate rouge_score bert_score sacrebleu huggingface_hub"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f44d18ff",
+   "metadata": {},
+   "source": [
+    "## Part 2 - Import libraries\n",
+    "\n",
+    "This section imports the libraries used in the training pipeline.\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "47a8b3f7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from __future__ import annotations\n",
+    "\n",
+    "import gc\n",
+    "import json\n",
+    "import math\n",
+    "import random\n",
+    "import re\n",
+    "import time\n",
+    "from pathlib import Path\n",
+    "from typing import Dict, List\n",
+    "from pathlib import Path\n",
+    "\n",
+    "import numpy as np\n",
+    "import torch\n",
+    "from datasets import Dataset, DatasetDict\n",
+    "from huggingface_hub import snapshot_download\n",
+    "from peft import LoraConfig, PeftModel\n",
+    "import transformers\n",
+    "from transformers import (\n",
+    "    AutoModelForCausalLM,\n",
+    "    AutoTokenizer,\n",
+    "    BitsAndBytesConfig,\n",
+    "    set_seed,\n",
+    ")\n",
+    "from trl import SFTConfig, SFTTrainer"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "469e1466",
+   "metadata": {},
+   "source": [
+    "## Part 3 - Configuration\n",
+    "\n",
+    "This section defines project paths, model paths, output paths, and training hyperparameters.\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "59d68547",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "PROJECT_ROOT       = /scr/user/kevin2002/TensorCat/NLP/UM_Handbook\n",
+      "DATASET_ROOT       = /scr/user/kevin2002/TensorCat/NLP/UM_Handbook/Dataset/SFT_Dataset\n",
+      "DATASET_PATH       = /scr/user/kevin2002/TensorCat/NLP/UM_Handbook/Dataset/SFT_Dataset/SFT_QA_Training_Ready.jsonl\n",
+      "BASE_MODEL_DIR     = /scr/user/kevin2002/TensorCat/NLP/UM_Handbook/models/Qwen3-8B\n",
+      "OUTPUT_ROOT        = /scr/user/kevin2002/TensorCat/NLP/UM_Handbook/outputs/qwen3_um_handbook\n",
+      "USE_4BIT           = True\n",
+      "MAX_GRAD_NORM     = 1.0\n",
+      "PACKING           = False\n",
+      "GRADIENT_CHECKPOINTING = True\n",
+      "LOW_CPU_MEM_USAGE = True\n",
+      "USE_FLASH_ATTENTION_2_IF_AVAILABLE = False\n"
+     ]
+    }
+   ],
+   "source": [
+    "# ============================================================\n",
+    "# CONFIG\n",
+    "# ============================================================\n",
+    "\n",
+    "from pathlib import Path\n",
+    "\n",
+    "WARMUP_STEPS = 20\n",
+    "\n",
+    "# Project root\n",
+    "PROJECT_ROOT = Path(\"/scr/user/kevin2002/TensorCat/NLP/UM_Handbook\")\n",
+    "\n",
+    "# Dataset paths\n",
+    "DATASET_ROOT = PROJECT_ROOT / \"Dataset\" / \"SFT_Dataset\"\n",
+    "DATASET_PATH = DATASET_ROOT / \"SFT_QA_Training_Ready.jsonl\"\n",
+    "\n",
+    "# Base model selection\n",
+    "BASE_MODEL_NAME = \"Qwen/Qwen3-8B\"\n",
+    "BASE_MODEL_LOCAL_DIR = PROJECT_ROOT / \"models\" / \"Qwen3-8B\"\n",
+    "\n",
+    "# Output paths\n",
+    "OUTPUT_ROOT = PROJECT_ROOT / \"outputs\" / \"qwen3_um_handbook\"\n",
+    "ADAPTER_OUTPUT_DIR = OUTPUT_ROOT / \"lora_adapter\"\n",
+    "MERGED_MODEL_DIR = OUTPUT_ROOT / \"merged_model\"\n",
+    "FINAL_PT_PATH = OUTPUT_ROOT / \"Qwen3-8B-Instruct_UM_Handbook.pt\"\n",
+    "METRICS_JSON_PATH = OUTPUT_ROOT / \"final_metrics.json\"\n",
+    "PREDICTIONS_JSONL_PATH = OUTPUT_ROOT / \"validation_predictions.jsonl\"\n",
+    "TRAIN_VAL_SPLIT_JSON_PATH = OUTPUT_ROOT / \"dataset_split_summary.json\"\n",
+    "\n",
+    "# Data / prompt settings\n",
+    "SYSTEM_PROMPT = (\n",
+    "    \"You are an academic assistant for the Faculty of Computer Science and \"\n",
+    "    \"Information Technology, Universiti Malaya. Answer questions accurately \"\n",
+    "    \"and only using handbook-consistent information. If the handbook does not support \"\n",
+    "    \"a claim, avoid inventing details.\"\n",
+    ")\n",
+    "TRAIN_VAL_RATIO = 0.90\n",
+    "MAX_SEQ_LENGTH = 1024\n",
+    "RANDOM_SEED = 42\n",
+    "\n",
+    "# LoRA / QLoRA settings\n",
+    "USE_4BIT = True\n",
+    "LORA_R = 32\n",
+    "LORA_ALPHA = 64\n",
+    "LORA_DROPOUT = 0.05\n",
+    "LORA_TARGET_MODULES = [\n",
+    "    \"q_proj\",\n",
+    "    \"k_proj\",\n",
+    "    \"v_proj\",\n",
+    "    \"o_proj\",\n",
+    "    \"gate_proj\",\n",
+    "    \"up_proj\",\n",
+    "    \"down_proj\",\n",
+    "]\n",
+    "\n",
+    "# Training settings\n",
+    "NUM_TRAIN_EPOCHS = 6\n",
+    "PER_DEVICE_TRAIN_BATCH_SIZE = 2\n",
+    "PER_DEVICE_EVAL_BATCH_SIZE = 2\n",
+    "GRADIENT_ACCUMULATION_STEPS = 8\n",
+    "LEARNING_RATE = 2e-4\n",
+    "WEIGHT_DECAY = 0.01\n",
+    "WARMUP_RATIO = 0.05\n",
+    "LOGGING_STEPS = 10\n",
+    "SAVE_STEPS = 50\n",
+    "EVAL_STEPS = 50\n",
+    "\n",
+    "MAX_GRAD_NORM = 1.0\n",
+    "PACKING = False\n",
+    "GRADIENT_CHECKPOINTING = True\n",
+    "LOW_CPU_MEM_USAGE = True\n",
+    "USE_FLASH_ATTENTION_2_IF_AVAILABLE = False\n",
+    "\n",
+    "# Save / display settings\n",
+    "SAVE_MERGED_MODEL = True\n",
+    "SAVE_TOKENIZER_WITH_MERGED = True\n",
+    "NUM_PRINTED_PREDICTIONS = 5\n",
+    "\n",
+    "\n",
+    "# Generation eval settings\n",
+    "MAX_NEW_TOKENS_EVAL = 192\n",
+    "NUM_EVAL_SAMPLES_FOR_GENERATION = None  # None = use full validation set\n",
+    "DO_SAMPLE_EVAL = False\n",
+    "TEMPERATURE_EVAL = 0.7\n",
+    "TOP_P_EVAL = 0.9\n",
+    "\n",
+    "# Final export settings\n",
+    "SAVE_SINGLE_PT = True\n",
+    "\n",
+    "# Create the main directories early\n",
+    "for path in [PROJECT_ROOT, DATASET_ROOT, BASE_MODEL_LOCAL_DIR.parent, OUTPUT_ROOT]:\n",
+    "    path.mkdir(parents=True, exist_ok=True)\n",
+    "\n",
+    "print(\"PROJECT_ROOT       =\", PROJECT_ROOT)\n",
+    "print(\"DATASET_ROOT       =\", DATASET_ROOT)\n",
+    "print(\"DATASET_PATH       =\", DATASET_PATH)\n",
+    "print(\"BASE_MODEL_DIR     =\", BASE_MODEL_LOCAL_DIR)\n",
+    "print(\"OUTPUT_ROOT        =\", OUTPUT_ROOT)\n",
+    "print(\"USE_4BIT           =\", USE_4BIT)\n",
+    "\n",
+    "\n",
+    "print(\"MAX_GRAD_NORM     =\", MAX_GRAD_NORM)\n",
+    "print(\"PACKING           =\", PACKING)\n",
+    "print(\"GRADIENT_CHECKPOINTING =\", GRADIENT_CHECKPOINTING)\n",
+    "print(\"LOW_CPU_MEM_USAGE =\", LOW_CPU_MEM_USAGE)\n",
+    "print(\"USE_FLASH_ATTENTION_2_IF_AVAILABLE =\", USE_FLASH_ATTENTION_2_IF_AVAILABLE)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e9319138",
+   "metadata": {},
+   "source": [
+    "## Part 4 - Helper functions\n",
+    "\n",
+    "This section defines utility functions for paths, logging, text cleanup, device selection, dtype selection, and 4-bit control.\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "f4a0e4f4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def ensure_dir(path: Path) -> None:\n",
+    "    \"\"\"Create ``path`` together with any missing parents; a no-op if it already exists.\"\"\"\n",
+    "    path.mkdir(exist_ok=True, parents=True)\n",
+    "\n",
+    "def print_banner(title: str) -> None:\n",
+    "    \"\"\"Print ``title`` framed by two 88-character ``=`` rules, preceded by a blank line.\"\"\"\n",
+    "    rule = \"=\" * 88\n",
+    "    print(\"\\n\" + rule, title, rule, sep=\"\\n\")\n",
+    "\n",
+    "def select_runtime_backend() -> str:\n",
+    "    \"\"\"Return the preferred torch backend name: ``\"cuda\"``, then ``\"mps\"``, then ``\"cpu\"``.\"\"\"\n",
+    "    if torch.cuda.is_available():\n",
+    "        return \"cuda\"\n",
+    "    # Older torch builds have no ``torch.backends.mps`` attribute at all.\n",
+    "    mps_backend = getattr(torch.backends, \"mps\", None)\n",
+    "    if mps_backend is not None and mps_backend.is_available():\n",
+    "        return \"mps\"\n",
+    "    return \"cpu\"\n",
+    "\n",
+    "def detect_compute_dtype(backend: str) -> torch.dtype:\n",
+    "    \"\"\"Choose the compute dtype for ``backend``.\n",
+    "\n",
+    "    CUDA uses bfloat16 when the device supports it and float16 otherwise, MPS\n",
+    "    uses float16, and every other backend name falls back to float32.\n",
+    "    \"\"\"\n",
+    "    if backend == \"mps\":\n",
+    "        return torch.float16\n",
+    "    if backend != \"cuda\":\n",
+    "        return torch.float32\n",
+    "    return torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16\n",
+    "\n",
+    "def should_use_4bit(backend: str) -> bool:\n",
+    "    \"\"\"Tell whether 4-bit loading applies: ``USE_4BIT`` must be set and ``backend`` must be CUDA.\"\"\"\n",
+    "    on_cuda = backend == \"cuda\"\n",
+    "    return USE_4BIT and on_cuda\n",
+    "\n",
+    "def normalize_text(text: str) -> str:\n",
+    "    \"\"\"Trim ``text`` and collapse every run of whitespace into a single space.\"\"\"\n",
+    "    return re.sub(r\"\\s+\", \" \", text.strip())\n",
+    "\n",
+    "def normalize_for_exact(text: str) -> str:\n",
+    "    \"\"\"Canonicalize ``text`` for exact-match and token-overlap scoring.\n",
+    "\n",
+    "    Lower-cases the text, replaces every character that is neither a word\n",
+    "    character nor whitespace with a space, collapses whitespace runs, and\n",
+    "    strips leading/trailing whitespace from the result.\n",
+    "    \"\"\"\n",
+    "    text = re.sub(r\"[^\\w\\s]\", \" \", text.lower())\n",
+    "    # Strip last: punctuation at either end turns into a space, so stripping\n",
+    "    # before the substitution made \"Yes.\" and \"Yes\" normalize differently.\n",
+    "    return re.sub(r\"\\s+\", \" \", text).strip()\n",
+    "\n",
+    "def cleanup_memory() -> None:\n",
+    "    \"\"\"Run the garbage collector, then release cached accelerator memory.\n",
+    "\n",
+    "    The CUDA cache is emptied when CUDA is available. The MPS cache is emptied\n",
+    "    when MPS is available and this torch build exposes ``torch.mps.empty_cache``.\n",
+    "    \"\"\"\n",
+    "    gc.collect()\n",
+    "    if torch.cuda.is_available():\n",
+    "        torch.cuda.empty_cache()\n",
+    "    # The notebook also supports the MPS backend, so free its cache as well.\n",
+    "    # Both lookups are guarded because older torch builds lack these attributes.\n",
+    "    mps_backend = getattr(torch.backends, \"mps\", None)\n",
+    "    mps_module = getattr(torch, \"mps\", None)\n",
+    "    if mps_backend is not None and mps_backend.is_available() and hasattr(mps_module, \"empty_cache\"):\n",
+    "        mps_module.empty_cache()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e8d9c423",
+   "metadata": {},
+   "source": [
+    "## Part 5 - Device detection and display\n",
+    "\n",
+    "This section prints the available devices, selected backend, selected dtype, and whether 4-bit loading is enabled.\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "64398701",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "========================================================================================\n",
+      "Device Detection\n",
+      "========================================================================================\n",
+      "{\n",
+      "  \"selected_backend\": \"cuda\",\n",
+      "  \"torch_dtype\": \"torch.bfloat16\",\n",
+      "  \"use_4bit_qlora\": true,\n",
+      "  \"cuda_available\": true,\n",
+      "  \"mps_available\": false,\n",
+      "  \"cuda_device_count\": 1,\n",
+      "  \"cuda_device_name\": \"NVIDIA A100-SXM4-80GB\"\n",
+      "}\n"
+     ]
+    }
+   ],
+   "source": [
+    "print_banner(\"Device Detection\")\n",
+    "\n",
+    "RUNTIME_DEVICE_BACKEND = select_runtime_backend()\n",
+    "effective_dtype = detect_compute_dtype(RUNTIME_DEVICE_BACKEND)\n",
+    "effective_use_4bit = should_use_4bit(RUNTIME_DEVICE_BACKEND)\n",
+    "\n",
+    "# Summary of the detected runtime; printed as JSON below.\n",
+    "device_info = {\n",
+    "    \"selected_backend\": RUNTIME_DEVICE_BACKEND,\n",
+    "    \"torch_dtype\": str(effective_dtype),\n",
+    "    \"use_4bit_qlora\": effective_use_4bit,\n",
+    "    \"cuda_available\": torch.cuda.is_available(),\n",
+    "    \"mps_available\": hasattr(torch.backends, \"mps\") and torch.backends.mps.is_available(),\n",
+    "}\n",
+    "\n",
+    "if torch.cuda.is_available():\n",
+    "    device_info[\"cuda_device_count\"] = torch.cuda.device_count()\n",
+    "    try:\n",
+    "        device_info[\"cuda_device_name\"] = torch.cuda.get_device_name(0)\n",
+    "    except Exception:\n",
+    "        # Best effort: the device name is informational only.\n",
+    "        device_info[\"cuda_device_name\"] = \"Unavailable\"\n",
+    "\n",
+    "print(json.dumps(device_info, indent=2))\n",
+    "\n",
+    "if RUNTIME_DEVICE_BACKEND != \"cuda\":\n",
+    "    # The leading escape must be a real newline so the notice is separated\n",
+    "    # from the JSON dump above instead of starting with a literal backslash-n.\n",
+    "    print(\n",
+    "        \"\\n[Info] Non-CUDA backend detected. 4-bit bitsandbytes QLoRA is disabled automatically, \"\n",
+    "        \"and the training path falls back to standard LoRA on the selected backend.\"\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "69b02751",
+   "metadata": {},
+   "source": [
+    "## Part 6 - Evaluation functions\n",
+    "\n",
+    "This section defines exact-match and token-level F1 scoring functions.\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "379c9dd7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def token_f1(prediction: str, reference: str) -> float:\n",
+    "    \"\"\"Token-level F1 between ``prediction`` and ``reference``.\n",
+    "\n",
+    "    Both strings are normalized with ``normalize_for_exact`` and split on\n",
+    "    whitespace. Overlap is the multiset intersection of the two token lists,\n",
+    "    so a repeated token only counts as often as it appears on both sides.\n",
+    "\n",
+    "    Returns:\n",
+    "        1.0 when both token lists are empty, 0.0 when exactly one is empty or\n",
+    "        no token is shared, otherwise the harmonic mean of precision and recall.\n",
+    "    \"\"\"\n",
+    "    from collections import Counter  # local import: not in the notebook's import cell\n",
+    "\n",
+    "    pred_tokens = normalize_for_exact(prediction).split()\n",
+    "    ref_tokens = normalize_for_exact(reference).split()\n",
+    "\n",
+    "    if not pred_tokens and not ref_tokens:\n",
+    "        return 1.0\n",
+    "    if not pred_tokens or not ref_tokens:\n",
+    "        return 0.0\n",
+    "\n",
+    "    # Counter & Counter keeps, per token, the smaller of the two counts.\n",
+    "    overlap = sum((Counter(pred_tokens) & Counter(ref_tokens)).values())\n",
+    "    if overlap == 0:\n",
+    "        return 0.0\n",
+    "\n",
+    "    precision = overlap / len(pred_tokens)\n",
+    "    recall = overlap / len(ref_tokens)\n",
+    "    return 2 * precision * recall / (precision + recall)\n",
+    "\n",
+    "def exact_match(prediction: str, reference: str) -> float:\n",
+    "    \"\"\"Return 1.0 if both strings are equal after ``normalize_for_exact``, else 0.0.\"\"\"\n",
+    "    is_same = normalize_for_exact(prediction) == normalize_for_exact(reference)\n",
+    "    return 1.0 if is_same else 0.0"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d26e26fd",
+   "metadata": {},
+   "source": [
+    "## Part 7 - Dataset reading and validation functions\n",
+    "\n",
+    "This section reads the JSONL file, checks required fields, and prepares prompt-completion rows.\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "56287c34",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def load_jsonl(path: Path) -> List[Dict]:\n",
+    "    \"\"\"Read a JSONL question/answer file and return its rows.\n",
+    "\n",
+    "    Blank lines are skipped. Every other line must be a JSON object holding\n",
+    "    string ``question`` and ``answer`` values; both are cleaned with\n",
+    "    ``normalize_text``. Any additional keys are kept as they are.\n",
+    "\n",
+    "    Raises:\n",
+    "        json.JSONDecodeError: A line is not valid JSON (the message carries the\n",
+    "            line number within the file).\n",
+    "        ValueError: A line is not a JSON object, lacks a required key, holds a\n",
+    "            non-string question/answer, or the file contains no rows at all.\n",
+    "    \"\"\"\n",
+    "    required = (\"question\", \"answer\")\n",
+    "    rows: List[Dict] = []\n",
+    "    with path.open(\"r\", encoding=\"utf-8\") as f:\n",
+    "        for line_number, line in enumerate(f, 1):\n",
+    "            line = line.strip()\n",
+    "            if not line:\n",
+    "                continue\n",
+    "            try:\n",
+    "                obj = json.loads(line)\n",
+    "            except json.JSONDecodeError as exc:\n",
+    "                # Same exception type as before, now pointing at the offending line.\n",
+    "                raise json.JSONDecodeError(\n",
+    "                    f\"Line {line_number}: {exc.msg}\", exc.doc, exc.pos\n",
+    "                ) from exc\n",
+    "            if not isinstance(obj, dict):\n",
+    "                raise ValueError(f\"Line {line_number} is not a JSON object.\")\n",
+    "            missing = set(required) - set(obj.keys())\n",
+    "            if missing:\n",
+    "                raise ValueError(f\"Line {line_number} is missing keys: {sorted(missing)}\")\n",
+    "            for key in required:\n",
+    "                # normalize_text calls str methods, so reject other types up front.\n",
+    "                if not isinstance(obj[key], str):\n",
+    "                    raise ValueError(f\"Line {line_number} has a non-string value for key: {key!r}\")\n",
+    "                obj[key] = normalize_text(obj[key])\n",
+    "            rows.append(obj)\n",
+    "\n",
+    "    if not rows:\n",
+    "        raise ValueError(f\"Dataset at {path} is empty.\")\n",
+    "    return rows\n",
+    "\n",
+    "def build_prompt_completion_rows(rows: List[Dict]) -> List[Dict]:\n",
+    "    \"\"\"Turn raw QA rows into chat-style prompt/completion records.\n",
+    "\n",
+    "    Each record holds ``qa_id`` / ``index_id`` (empty string when the row has\n",
+    "    none), a system + user ``prompt``, a one-message assistant ``completion``,\n",
+    "    and the plain ``question`` / ``answer`` strings.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def to_record(row: Dict) -> Dict:\n",
+    "        question, answer = row[\"question\"], row[\"answer\"]\n",
+    "        system_msg = {\"role\": \"system\", \"content\": SYSTEM_PROMPT}\n",
+    "        user_msg = {\"role\": \"user\", \"content\": question}\n",
+    "        assistant_msg = {\"role\": \"assistant\", \"content\": answer}\n",
+    "        return {\n",
+    "            \"qa_id\": row.get(\"qa_id\", \"\"),\n",
+    "            \"index_id\": row.get(\"index_id\", \"\"),\n",
+    "            \"prompt\": [system_msg, user_msg],\n",
+    "            \"completion\": [assistant_msg],\n",
+    "            \"question\": question,\n",
+    "            \"answer\": answer,\n",
+    "        }\n",
+    "\n",
+    "    return [to_record(row) for row in rows]\n",
+    "\n",
+    "def split_dataset(rows: List[Dict], train_ratio: float, seed: int) -> DatasetDict:\n",
+    "    \"\"\"Shuffle ``rows`` deterministically and split them into train / validation.\n",
+    "\n",
+    "    Args:\n",
+    "        rows: Records to split; the caller's list is left untouched.\n",
+    "        train_ratio: Fraction of rows assigned to the train split. The split\n",
+    "            index is clamped so that each split receives at least one row.\n",
+    "        seed: Seed for the private ``random.Random`` used for shuffling.\n",
+    "\n",
+    "    Returns:\n",
+    "        A ``DatasetDict`` with ``\"train\"`` and ``\"validation\"`` entries.\n",
+    "\n",
+    "    Raises:\n",
+    "        ValueError: Fewer than two rows were given, so two non-empty splits\n",
+    "            cannot be formed.\n",
+    "    \"\"\"\n",
+    "    # With 0 or 1 rows the clamping below would silently yield an empty train split.\n",
+    "    if len(rows) < 2:\n",
+    "        raise ValueError(\n",
+    "            f\"Need at least 2 rows to build a train/validation split, got {len(rows)}.\"\n",
+    "        )\n",
+    "\n",
+    "    shuffled = rows.copy()\n",
+    "    random.Random(seed).shuffle(shuffled)\n",
+    "\n",
+    "    split_idx = max(1, int(len(shuffled) * train_ratio))\n",
+    "    split_idx = min(split_idx, len(shuffled) - 1)\n",
+    "\n",
+    "    return DatasetDict(\n",
+    "        {\n",
+    "            \"train\": Dataset.from_list(shuffled[:split_idx]),\n",
+    "            \"validation\": Dataset.from_list(shuffled[split_idx:]),\n",
+    "        }\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "53f03f17",
+   "metadata": {},
+   "source": [
+    "## Part 8 - Read and inspect the dataset\n",
+    "\n",
+    "This section loads the dataset, creates the train / validation split, and prints one sample.\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "943f8ae3",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "========================================================================================\n",
+      "Step 1 - Validate dataset\n",
+      "========================================================================================\n",
+      "{\n",
+      "  \"dataset_path\": \"/scr/user/kevin2002/TensorCat/NLP/UM_Handbook/Dataset/SFT_Dataset/SFT_QA_Training_Ready.jsonl\",\n",
+      "  \"total_examples\": 388,\n",
+      "  \"train_examples\": 349,\n",
+      "  \"validation_examples\": 39,\n",
+      "  \"seed\": 42,\n",
+      "  \"train_val_ratio\": 0.9\n",
+      "}\n",
+      "\\nSample example:\n",
+      "{\n",
+      "  \"qa_id\": \"qa_000001\",\n",
+      "  \"index_id\": \"UMI-0001\",\n",
+      "  \"prompt\": [\n",
+      "    {\n",
+      "      \"role\": \"system\",\n",
+      "      \"content\": \"You are an academic assistant for the Faculty of Computer Science and Information Technology, Universiti Malaya. Answer questions accurately and only using handbook-consistent information. If the handbook does not support a claim, avoid inventing details.\"\n",
+      "    },\n",
+      "    {\n",
+      "      \"role\": \"user\",\n",
+      "      \"content\": \"What are the faculty objectives?\"\n",
+      "    }\n",
+      "  ],\n",
+      "  \"completion\": [\n",
+      "    {\n",
+      "      \"role\": \"assistant\",\n",
+      "      \"content\": \"The faculty objectives are to sustain excellence in undergraduate and postgraduate teaching, learning, and research; contribute to national development through quality research and publications; provide innovative academic programmes that respond to societal needs; and produce quality graduates with advanced knowledge and skills in computer science and information technology.\"\n",
+      "    }\n",
+      "  ],\n",
+      "  \"question\": \"What are the faculty objectives?\",\n",
+      "  \"answer\": \"The faculty objectives are to sustain excellence in undergraduate and postgraduate teaching, learning, and research; contribute to national development through quality research and publications; provide innovative academic programmes that respond to societal needs; and produce quality graduates with advanced knowledge and skills in computer science and information technology.\"\n",
+      "}\n"
+     ]
+    }
+   ],
+   "source": [
+    "print_banner(\"Step 1 - Validate dataset\")\n",
+    "\n",
+    "if not DATASET_PATH.exists():\n",
+    "    raise FileNotFoundError(\n",
+    "        f\"Dataset not found: {DATASET_PATH}\\n\"\n",
+    "        f\"Place SFT_QA_Training_Ready.jsonl in DATASET_ROOT or update DATASET_PATH.\"\n",
+    "    )\n",
+    "\n",
+    "# Seed every RNG before the split is created.\n",
+    "set_seed(RANDOM_SEED)\n",
+    "random.seed(RANDOM_SEED)\n",
+    "np.random.seed(RANDOM_SEED)\n",
+    "\n",
+    "raw_rows = load_jsonl(DATASET_PATH)\n",
+    "converted_rows = build_prompt_completion_rows(raw_rows)\n",
+    "dataset_dict = split_dataset(converted_rows, TRAIN_VAL_RATIO, RANDOM_SEED)\n",
+    "\n",
+    "split_summary = {\n",
+    "    \"dataset_path\": str(DATASET_PATH),\n",
+    "    \"total_examples\": len(converted_rows),\n",
+    "    \"train_examples\": len(dataset_dict[\"train\"]),\n",
+    "    \"validation_examples\": len(dataset_dict[\"validation\"]),\n",
+    "    \"seed\": RANDOM_SEED,\n",
+    "    \"train_val_ratio\": TRAIN_VAL_RATIO,\n",
+    "}\n",
+    "\n",
+    "print(json.dumps(split_summary, indent=2, ensure_ascii=False))\n",
+    "# A real newline escape: the doubled backslash printed the two characters\n",
+    "# backslash + n in front of the heading.\n",
+    "print(\"\\nSample example:\")\n",
+    "# The preview is cut at 1800 characters.\n",
+    "print(json.dumps(converted_rows[0], indent=2, ensure_ascii=False)[:1800])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1589862e",
+   "metadata": {},
+   "source": [
+    "## Part 9 - Save the dataset split summary\n",
+    "\n",
+    "This section writes the split summary to JSON.\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "61b5ae46",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Saved split summary to: /scr/user/kevin2002/TensorCat/NLP/UM_Handbook/outputs/qwen3_um_handbook/dataset_split_summary.json\n"
+     ]
+    }
+   ],
+   "source": [
+    "def save_json(path: Path, obj: Dict) -> None:\n",
+    "    \"\"\"Write ``obj`` to ``path`` as 2-space-indented UTF-8 JSON, creating the parent directory first.\"\"\"\n",
+    "    ensure_dir(path.parent)\n",
+    "    with path.open(\"w\", encoding=\"utf-8\") as handle:\n",
+    "        json.dump(obj, handle, ensure_ascii=False, indent=2)\n",
+    "\n",
+    "def save_predictions_jsonl(path: Path, rows: List[Dict]) -> None:\n",
+    "    \"\"\"Write ``rows`` to ``path`` as JSON Lines: one UTF-8 JSON object per line.\n",
+    "\n",
+    "    The parent directory is created first via ``ensure_dir``.\n",
+    "    \"\"\"\n",
+    "    ensure_dir(path.parent)\n",
+    "    with path.open(\"w\", encoding=\"utf-8\") as f:\n",
+    "        for row in rows:\n",
+    "            # Terminate each record with a real newline. The doubled backslash\n",
+    "            # wrote a literal backslash-n and left every record on one line.\n",
+    "            f.write(json.dumps(row, ensure_ascii=False) + \"\\n\")\n",
+    "\n",
+    "ensure_dir(OUTPUT_ROOT)\n",
+    "save_json(TRAIN_VAL_SPLIT_JSON_PATH, split_summary)\n",
+    "print(f\"Saved split summary to: {TRAIN_VAL_SPLIT_JSON_PATH}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "79e53130",
+   "metadata": {},
+   "source": [
+    "## Part 10 - Download the base model into a local directory\n",
+    "\n",
+    "This section reuses the local model if it already exists; otherwise it downloads the base model to the configured model directory.\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "73f9a585",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Base model already exists at: /scr/user/kevin2002/TensorCat/NLP/UM_Handbook/models/Qwen3-8B\n",
+      "Local model path: /scr/user/kevin2002/TensorCat/NLP/UM_Handbook/models/Qwen3-8B\n"
+     ]
+    }
+   ],
+   "source": [
+    "def download_base_model_if_needed() -> Path:\n",
+    "    \"\"\"Return the local base-model directory, downloading the snapshot if needed.\n",
+    "\n",
+    "    The model is treated as present when ``BASE_MODEL_LOCAL_DIR`` already\n",
+    "    contains ``config.json``. Otherwise ``BASE_MODEL_NAME`` is fetched into that\n",
+    "    directory with ``snapshot_download``.\n",
+    "    \"\"\"\n",
+    "    ensure_dir(BASE_MODEL_LOCAL_DIR)\n",
+    "\n",
+    "    if (BASE_MODEL_LOCAL_DIR / \"config.json\").exists():\n",
+    "        print(f\"Base model already exists at: {BASE_MODEL_LOCAL_DIR}\")\n",
+    "        return BASE_MODEL_LOCAL_DIR\n",
+    "\n",
+    "    print_banner(\"Downloading base model snapshot\")\n",
+    "    # ``local_dir_use_symlinks`` and ``resume_download`` are no longer passed:\n",
+    "    # they are deprecated in huggingface_hub, which now always writes real files\n",
+    "    # into ``local_dir`` and always resumes interrupted downloads.\n",
+    "    # NOTE(review): confirm against the huggingface_hub version installed on DICC.\n",
+    "    local_path = snapshot_download(\n",
+    "        repo_id=BASE_MODEL_NAME,\n",
+    "        local_dir=str(BASE_MODEL_LOCAL_DIR),\n",
+    "    )\n",
+    "    print(f\"Downloaded base model to: {local_path}\")\n",
+    "    return BASE_MODEL_LOCAL_DIR\n",
+    "\n",
+    "local_model_path = download_base_model_if_needed()\n",
+    "print(f\"Local model path: {local_model_path}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "94be6680",
+   "metadata": {},
+   "source": [
+    "## Part 11 - Load the tokenizer and training model\n",
+    "\n",
+    "This section loads the tokenizer and model, enables 4-bit QLoRA on CUDA, and falls back to standard LoRA on MPS / CPU.\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "b6771a4e",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "========================================================================================\n",
+      "Step 4 - Load tokenizer and model\n",
+      "========================================================================================\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "c8e4cf58c2d243ddb9fbf786788b5037",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Loading weights:   0%|          | 0/399 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Tokenizer and model loaded successfully.\n",
+      "Model class: Qwen3ForCausalLM\n"
+     ]
+    }
+   ],
+   "source": [
+    "def load_tokenizer(model_path: Path):\n",
+    "    \"\"\"Load the fast tokenizer stored at ``model_path`` and configure padding.\n",
+    "\n",
+    "    Padding is applied on the right, and the EOS token doubles as the pad token\n",
+    "    when the tokenizer defines none.\n",
+    "    \"\"\"\n",
+    "    tok = AutoTokenizer.from_pretrained(str(model_path), use_fast=True)\n",
+    "    tok.padding_side = \"right\"\n",
+    "    if tok.pad_token is None:\n",
+    "        tok.pad_token = tok.eos_token\n",
+    "    return tok\n",
+    "\n",
+    "def load_model_for_training(model_path: Path, backend: str):\n",
+    "    \"\"\"Load the causal LM at ``model_path`` configured for training on ``backend``.\n",
+    "\n",
+    "    On CUDA the model is loaded with ``device_map=\"auto\"``, with 4-bit NF4\n",
+    "    double quantization when ``should_use_4bit`` allows it, and with\n",
+    "    FlashAttention-2 when ``USE_FLASH_ATTENTION_2_IF_AVAILABLE`` is set. On MPS\n",
+    "    and CPU the loaded model is moved to the backend explicitly. The KV cache is\n",
+    "    switched off whenever gradient checkpointing is enabled.\n",
+    "    \"\"\"\n",
+    "    compute_dtype = detect_compute_dtype(backend)\n",
+    "    on_cuda = backend == \"cuda\"\n",
+    "\n",
+    "    load_kwargs = dict(\n",
+    "        pretrained_model_name_or_path=str(model_path),\n",
+    "        torch_dtype=compute_dtype,\n",
+    "        low_cpu_mem_usage=LOW_CPU_MEM_USAGE,\n",
+    "        trust_remote_code=False,\n",
+    "    )\n",
+    "    if on_cuda:\n",
+    "        load_kwargs[\"device_map\"] = \"auto\"\n",
+    "    if should_use_4bit(backend):\n",
+    "        load_kwargs[\"quantization_config\"] = BitsAndBytesConfig(\n",
+    "            load_in_4bit=True,\n",
+    "            bnb_4bit_quant_type=\"nf4\",\n",
+    "            bnb_4bit_use_double_quant=True,\n",
+    "            bnb_4bit_compute_dtype=compute_dtype,\n",
+    "        )\n",
+    "    if on_cuda and USE_FLASH_ATTENTION_2_IF_AVAILABLE:\n",
+    "        load_kwargs[\"attn_implementation\"] = \"flash_attention_2\"\n",
+    "\n",
+    "    model = AutoModelForCausalLM.from_pretrained(**load_kwargs)\n",
+    "\n",
+    "    # Only the CUDA path passes a device_map, so place the model by hand elsewhere.\n",
+    "    if backend in (\"mps\", \"cpu\"):\n",
+    "        model = model.to(backend)\n",
+    "\n",
+    "    model.config.use_cache = not GRADIENT_CHECKPOINTING\n",
+    "    if GRADIENT_CHECKPOINTING:\n",
+    "        model.gradient_checkpointing_enable()\n",
+    "\n",
+    "    return model\n",
+    "\n",
+    "print_banner(\"Step 4 - Load tokenizer and model\")\n",
+    "tokenizer = load_tokenizer(local_model_path)\n",
+    "model = load_model_for_training(local_model_path, RUNTIME_DEVICE_BACKEND)\n",
+    "\n",
+    "print(\"Tokenizer and model loaded successfully.\")\n",
+    "print(\"Model class:\", model.__class__.__name__)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "192160bb",
+   "metadata": {},
+   "source": [
+    "## Part 12 - Build the LoRA configuration and training arguments\n",
+    "\n",
+    "This section defines the LoRA settings and trainer arguments.\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "38a4cc45",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "========================================================================================\n",
+      "Step 5 - Build trainer\n",
+      "========================================================================================\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "2c1da0d8fb454d9bb3ca83a8f871a8b5",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Tokenizing train dataset (num_proc=1):   0%|          | 0/349 [00:00<?, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "e0b96c79ace94af39e6018adea9da222",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Tokenizing eval dataset (num_proc=1):   0%|          | 0/39 [00:00<?, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trainer built successfully.\n"
+     ]
+    }
+   ],
+   "source": [
+    "def build_peft_config() -> LoraConfig:\n",
+    "    \"\"\"Assemble the LoRA adapter configuration from the notebook-level LORA_* settings.\"\"\"\n",
+    "    lora_settings = dict(\n",
+    "        r=LORA_R,\n",
+    "        lora_alpha=LORA_ALPHA,\n",
+    "        lora_dropout=LORA_DROPOUT,\n",
+    "        target_modules=LORA_TARGET_MODULES,\n",
+    "        bias=\"none\",\n",
+    "        task_type=\"CAUSAL_LM\",\n",
+    "    )\n",
+    "    return LoraConfig(**lora_settings)\n",
+    "\n",
+    "def build_training_args(backend: str) -> SFTConfig:\n",
+    "    \"\"\"Build the SFTConfig for this run from the notebook-level hyperparameters.\n",
+    "\n",
+    "    Mixed precision follows ``backend``: bf16 when CUDA reports bf16 support,\n",
+    "    otherwise fp16 on CUDA/MPS, and neither flag on any other backend.\n",
+    "    \"\"\"\n",
+    "    on_cuda = backend == \"cuda\"\n",
+    "    use_bf16 = on_cuda and torch.cuda.is_bf16_supported()\n",
+    "    use_fp16 = (on_cuda or backend == \"mps\") and not use_bf16\n",
+    "    optimizer_name = \"paged_adamw_8bit\" if should_use_4bit(backend) else \"adamw_torch\"\n",
+    "\n",
+    "    return SFTConfig(\n",
+    "        # Run bookkeeping\n",
+    "        output_dir=str(OUTPUT_ROOT / \"trainer_runs\"),\n",
+    "        seed=RANDOM_SEED,\n",
+    "        report_to=\"none\",\n",
+    "        logging_steps=LOGGING_STEPS,\n",
+    "        # Batch sizing and schedule\n",
+    "        num_train_epochs=NUM_TRAIN_EPOCHS,\n",
+    "        per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,\n",
+    "        per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_SIZE,\n",
+    "        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,\n",
+    "        # Optimisation\n",
+    "        optim=optimizer_name,\n",
+    "        learning_rate=LEARNING_RATE,\n",
+    "        weight_decay=WEIGHT_DECAY,\n",
+    "        warmup_steps=WARMUP_STEPS,\n",
+    "        lr_scheduler_type=\"cosine\",\n",
+    "        max_grad_norm=MAX_GRAD_NORM,\n",
+    "        # Precision and memory\n",
+    "        bf16=use_bf16,\n",
+    "        fp16=use_fp16,\n",
+    "        gradient_checkpointing=GRADIENT_CHECKPOINTING,\n",
+    "        # Evaluation and checkpointing; the best checkpoint is the one with the lowest eval_loss\n",
+    "        eval_strategy=\"steps\",\n",
+    "        eval_steps=EVAL_STEPS,\n",
+    "        save_strategy=\"steps\",\n",
+    "        save_steps=SAVE_STEPS,\n",
+    "        save_total_limit=2,\n",
+    "        load_best_model_at_end=True,\n",
+    "        metric_for_best_model=\"eval_loss\",\n",
+    "        greater_is_better=False,\n",
+    "        # Dataset handling\n",
+    "        max_length=MAX_SEQ_LENGTH,\n",
+    "        packing=PACKING,\n",
+    "        dataset_num_proc=1,\n",
+    "        completion_only_loss=True,\n",
+    "        remove_unused_columns=False,\n",
+    "    )\n",
+    "\n",
+    "def build_trainer(model, tokenizer, dataset_dict: DatasetDict, backend: str) -> SFTTrainer:\n",
+    "    \"\"\"Create the SFTTrainer that fine-tunes ``model`` with LoRA on the train/validation splits.\"\"\"\n",
+    "    lora_config = build_peft_config()\n",
+    "    sft_config = build_training_args(backend)\n",
+    "    return SFTTrainer(\n",
+    "        model=model,\n",
+    "        args=sft_config,\n",
+    "        peft_config=lora_config,\n",
+    "        processing_class=tokenizer,\n",
+    "        train_dataset=dataset_dict[\"train\"],\n",
+    "        eval_dataset=dataset_dict[\"validation\"],\n",
+    "    )\n",
+    "\n",
+    "print_banner(\"Step 5 - Build trainer\")\n",
+    "trainer = build_trainer(model, tokenizer, dataset_dict, RUNTIME_DEVICE_BACKEND)\n",
+    "print(\"Trainer built successfully.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b7892f31",
+   "metadata": {},
+   "source": [
+    "## Part 13 - Start training\n",
+    "\n",
+    "This section starts fine-tuning.\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "fee6890b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "========================================================================================\n",
+      "Step 6 - Train\n",
+      "========================================================================================\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "    <div>\n",
+       "      \n",
+       "      <progress value='132' max='132' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+       "      [132/132 11:01, Epoch 6/6]\n",
+       "    </div>\n",
+       "    <table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       " <tr style=\"text-align: left;\">\n",
+       "      <th>Step</th>\n",
+       "      <th>Training Loss</th>\n",
+       "      <th>Validation Loss</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>50</td>\n",
+       "      <td>0.300081</td>\n",
+       "      <td>1.132511</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>100</td>\n",
+       "      <td>0.025233</td>\n",
+       "      <td>1.302579</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table><p>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Train metrics:\n",
+      "{\n",
+      "  \"train_runtime\": 666.4624,\n",
+      "  \"train_samples_per_second\": 3.142,\n",
+      "  \"train_steps_per_second\": 0.198,\n",
+      "  \"total_flos\": 1.4419846776152064e+16,\n",
+      "  \"train_loss\": 0.26707774019715463\n",
+      "}\n",
+      "Adapter saved to: /scr/user/kevin2002/TensorCat/NLP/UM_Handbook/outputs/qwen3_um_handbook/lora_adapter\n",
+      "Train stage minutes: 11.13\n"
+     ]
+    }
+   ],
+   "source": [
+    "print_banner(\"Step 6 - Train\")\n",
+    "train_start_time = time.time()\n",
+    "\n",
+    "# Fine-tune, then write the adapter weights and the tokenizer into the same directory.\n",
+    "train_result = trainer.train()\n",
+    "trainer.save_model(str(ADAPTER_OUTPUT_DIR))\n",
+    "tokenizer.save_pretrained(str(ADAPTER_OUTPUT_DIR))\n",
+    "\n",
+    "train_metrics = train_result.metrics\n",
+    "print(\"Train metrics:\")\n",
+    "print(json.dumps(train_metrics, indent=2, default=str))\n",
+    "\n",
+    "print(f\"Adapter saved to: {ADAPTER_OUTPUT_DIR}\")\n",
+    "# The reported duration covers training plus the two save calls above.\n",
+    "train_stage_minutes = (time.time() - train_start_time) / 60\n",
+    "print(f\"Train stage minutes: {train_stage_minutes:.2f}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1a64c5b7",
+   "metadata": {},
+   "source": [
+    "## Part 14 - Teacher-forced loss evaluation\n",
+    "\n",
+    "This section computes `eval_loss` and `perplexity` on the validation set.\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "7ceb96b4",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "========================================================================================\n",
+      "Step 7 - Evaluate teacher-forced loss\n",
+      "========================================================================================\n",
+      "Eval loss  : 1.132510781288147\n",
+      "Perplexity : 3.1034387822556253\n",
+      "{'eval_loss': 1.132510781288147, 'eval_runtime': 2.8459, 'eval_samples_per_second': 13.704, 'eval_steps_per_second': 7.028}\n"
+     ]
+    }
+   ],
+   "source": [
+    "print_banner(\"Step 7 - Evaluate teacher-forced loss\")\n",
+    "\n",
+    "# Remove notebook progress callback to avoid Jupyter evaluate callback error\n",
+    "trainer.remove_callback(transformers.utils.notebook.NotebookProgressCallback)\n",
+    "\n",
+    "eval_metrics = trainer.evaluate()\n",
+    "eval_loss = float(eval_metrics.get(\"eval_loss\", float(\"nan\")))\n",
+    "\n",
+    "# The exponent is clamped at 20 before exp(); a missing or non-finite loss yields NaN.\n",
+    "if math.isfinite(eval_loss):\n",
+    "    perplexity = float(math.exp(min(eval_loss, 20)))\n",
+    "else:\n",
+    "    perplexity = float(\"nan\")\n",
+    "\n",
+    "print(\"Eval loss  :\", eval_loss)\n",
+    "print(\"Perplexity :\", perplexity)\n",
+    "print(eval_metrics)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0da67d71",
+   "metadata": {},
+   "source": [
+    "## Part 15 - Generation evaluation functions\n",
+    "\n",
+    "This section defines generation-based evaluation functions and metric calculation.\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "id": "eff9034b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def format_eval_prompt(tokenizer, question: str) -> str:\n",
+    "    \"\"\"Render ``question`` as a system+user chat prompt that ends with the generation prompt.\n",
+    "\n",
+    "    The tokenizer's chat template is applied with ``tokenize=False``, so the result is prompt text.\n",
+    "    \"\"\"\n",
+    "    system_turn = {\"role\": \"system\", \"content\": SYSTEM_PROMPT}\n",
+    "    user_turn = {\"role\": \"user\", \"content\": question}\n",
+    "    prompt_text = tokenizer.apply_chat_template(\n",
+    "        [system_turn, user_turn],\n",
+    "        tokenize=False,\n",
+    "        add_generation_prompt=True,\n",
+    "    )\n",
+    "    return prompt_text\n",
+    "\n",
+    "@torch.inference_mode()\n",
+    "def generate_answers(model, tokenizer, questions: List[str], max_new_tokens: int) -> List[str]:\n",
+    "    \"\"\"Greedily generate one answer per question and return the cleaned answer texts.\n",
+    "\n",
+    "    Each question is rendered with ``format_eval_prompt`` and generated one at a time on the\n",
+    "    device that holds the model parameters. Only the newly generated tokens are decoded.\n",
+    "\n",
+    "    Any ``<think>...</think>`` block is removed before the text goes through ``normalize_text``.\n",
+    "    ``skip_special_tokens=True`` leaves those tags in the decoded text, so every prediction used\n",
+    "    to start with ``<think> </think>`` and was scored against references that never contain it.\n",
+    "\n",
+    "    Args:\n",
+    "        model: Causal LM exposing ``parameters()`` and ``generate()``.\n",
+    "        tokenizer: Tokenizer used for the chat template, encoding and decoding.\n",
+    "        questions: Raw user questions, one prompt per entry.\n",
+    "        max_new_tokens: Generation budget per question.\n",
+    "\n",
+    "    Returns:\n",
+    "        One normalized answer string per question, in input order.\n",
+    "    \"\"\"\n",
+    "    import re  # local import so this cell does not depend on the notebook's import cell\n",
+    "\n",
+    "    think_block = re.compile(r\"<think>.*?</think>\", flags=re.DOTALL)\n",
+    "    device = next(model.parameters()).device\n",
+    "    outputs: List[str] = []\n",
+    "\n",
+    "    for question in questions:\n",
+    "        prompt = format_eval_prompt(tokenizer, question)\n",
+    "        encoded = tokenizer(\n",
+    "            prompt,\n",
+    "            return_tensors=\"pt\",\n",
+    "            truncation=True,\n",
+    "            max_length=MAX_SEQ_LENGTH,\n",
+    "        )\n",
+    "        encoded = {k: v.to(device) for k, v in encoded.items()}\n",
+    "\n",
+    "        generated = model.generate(\n",
+    "            **encoded,\n",
+    "            max_new_tokens=max_new_tokens,\n",
+    "            do_sample=False,\n",
+    "            temperature=None,\n",
+    "            top_p=None,\n",
+    "            repetition_penalty=1.05,\n",
+    "            pad_token_id=tokenizer.pad_token_id,\n",
+    "            eos_token_id=tokenizer.eos_token_id,\n",
+    "        )\n",
+    "\n",
+    "        # generate() returns prompt + continuation; keep only the continuation tokens.\n",
+    "        prompt_length = encoded[\"input_ids\"].shape[1]\n",
+    "        gen_only = generated[0][prompt_length:]\n",
+    "        text = tokenizer.decode(gen_only, skip_special_tokens=True)\n",
+    "\n",
+    "        # Replace with a space (not \"\") so text on either side of a block is never glued together.\n",
+    "        text = think_block.sub(\" \", text).strip()\n",
+    "        outputs.append(normalize_text(text))\n",
+    "\n",
+    "    return outputs\n",
+    "\n",
+    "def compute_generation_metrics(predictions: List[str], references: List[str]) -> Dict[str, float]:\n",
+    "    \"\"\"Score generated answers against their references with overlap-based metrics.\n",
+    "\n",
+    "    Returns exact match, token F1, ROUGE-1/2/L, sacreBLEU, chrF (``word_order=2``) and the mean\n",
+    "    whitespace-token length of predictions and references. ``bertscore_f1`` is always ``None``.\n",
+    "    \"\"\"\n",
+    "    import evaluate\n",
+    "    import sacrebleu\n",
+    "\n",
+    "    rouge_scores = evaluate.load(\"rouge\").compute(predictions=predictions, references=references)\n",
+    "\n",
+    "    # sacrebleu expects a list of reference streams, hence the extra list level.\n",
+    "    bleu_result = sacrebleu.corpus_bleu(predictions, [references])\n",
+    "    chrf_result = sacrebleu.corpus_chrf(predictions, [references], word_order=2)\n",
+    "\n",
+    "    pairs = list(zip(predictions, references))\n",
+    "    em_values = [exact_match(pred, ref) for pred, ref in pairs]\n",
+    "    f1_values = [token_f1(pred, ref) for pred, ref in pairs]\n",
+    "\n",
+    "    if predictions:\n",
+    "        avg_pred_len = float(np.mean([len(pred.split()) for pred in predictions]))\n",
+    "    else:\n",
+    "        avg_pred_len = 0.0\n",
+    "    if references:\n",
+    "        avg_ref_len = float(np.mean([len(ref.split()) for ref in references]))\n",
+    "    else:\n",
+    "        avg_ref_len = 0.0\n",
+    "\n",
+    "    return {\n",
+    "        \"exact_match\": float(np.mean(em_values)),\n",
+    "        \"token_f1\": float(np.mean(f1_values)),\n",
+    "        \"rouge1\": float(rouge_scores[\"rouge1\"]),\n",
+    "        \"rouge2\": float(rouge_scores[\"rouge2\"]),\n",
+    "        \"rougeL\": float(rouge_scores[\"rougeL\"]),\n",
+    "        \"bertscore_f1\": None,\n",
+    "        \"sacrebleu\": float(bleu_result.score),\n",
+    "        \"chrf_pp\": float(chrf_result.score),\n",
+    "        \"avg_prediction_words\": avg_pred_len,\n",
+    "        \"avg_reference_words\": avg_ref_len,\n",
+    "    }"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d9298597",
+   "metadata": {},
+   "source": [
+    "## Part 16 - Run final generation evaluation on the validation set\n",
+    "\n",
+    "This section generates answers on the validation set, computes metrics, saves predictions, and prints a few samples.\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "id": "995cd0ee",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "========================================================================================\n",
+      "Step 8 - Final generation evaluation on validation split\n",
+      "========================================================================================\n",
+      "Generation metrics:\n",
+      "{\n",
+      "  \"exact_match\": 0.0,\n",
+      "  \"token_f1\": 0.5181110282406411,\n",
+      "  \"rouge1\": 0.5171361078676141,\n",
+      "  \"rouge2\": 0.33460021476687485,\n",
+      "  \"rougeL\": 0.45557456154376447,\n",
+      "  \"bertscore_f1\": null,\n",
+      "  \"sacrebleu\": 31.010919781258593,\n",
+      "  \"chrf_pp\": 49.920320261813664,\n",
+      "  \"avg_prediction_words\": 36.0,\n",
+      "  \"avg_reference_words\": 36.69230769230769\n",
+      "}\n",
+      "\n",
+      "========================================================================================\n",
+      "Sample validation predictions\n",
+      "========================================================================================\n",
+      "\n",
+      "[Q] What courses are listed under programme core courses?\n",
+      "[REF] The handbook presents programme core courses as a curriculum table listing the approved course codes, course titles, credit values, and semester arrangement for the programme core component.\n",
+      "[PRED] <think> </think> The handbook presents programme core courses as a curriculum table listing the approved course codes, course titles, credit values, and semester arrangement for that part of the programme.\n",
+      "[EM=0, TokenF1=0.8772]\n",
+      "\n",
+      "[Q] What responsibilities do supervisors have for guidance, feedback, and assessment in the academic project?\n",
+      "[REF] Supervisors are responsible for assigning or confirming project titles in the ilmiah system, supervising and coaching students, meeting them regularly, verifying the progress logbook, evaluating reports, and entering marks in the ilmiah system. More broadly, the handbook frames supervisors as one of the key parties responsible for guidance, monitoring, feedback, and assessment in the academic project process.\n",
+      "[PRED] <think> </think> Supervisors are responsible for providing regular guidance, giving feedback on progress and submissions, carrying out viva or panel-based assessment where applicable, and submitting the final report through the ilmiah system for administration to issue the mark.\n",
+      "[EM=0, TokenF1=0.3542]\n",
+      "\n",
+      "[Q] What information is given about the faculty cafeteria?\n",
+      "[REF] The cafeteria is located at the back of Block A.\n",
+      "[PRED] <think> </think> The handbook states that the faculty cafeteria serves both staff and students and that it is open to all UM students.\n",
+      "[EM=0, TokenF1=0.2424]\n",
+      "\n",
+      "[Q] What dress expectations are illustrated for official events?\n",
+      "[REF] For official events, the poster illustrates formal attire, including suit-style clothing and traditional formal wear, to convey a neat and official appearance appropriate for formal university occasions.\n",
+      "[PRED] <think> </think> For official events, men are expected to follow formal or semi-formal Western business attire, while women should also aim for formal or appropriate Western office or ceremonial clothing.\n",
+      "[EM=0, TokenF1=0.3729]\n",
+      "\n",
+      "[Q] What courses are listed under specialization elective courses - artificial intelligence?\n",
+      "[REF] The handbook presents the Artificial Intelligence specialization electives as a curriculum table listing the approved course codes, course titles, credit values, and semester arrangement for that specialization.\n",
+      "[PRED] <think> </think> The specialization elective section is intended to show the elective pool available for that track. Students should use it as a selection list of approved course codes they can choose from, following the shown curriculum structure and any stated university or faculty rules for that programme.\n",
+      "[EM=0, TokenF1=0.3467]\n"
+     ]
+    }
+   ],
+   "source": [
+    "print_banner(\"Step 8 - Final generation evaluation on validation split\")\n",
+    "\n",
+    "# Start the metrics record with the teacher-forced results from the previous step.\n",
+    "final_metrics = {\n",
+    "    \"teacher_forced_eval\": eval_metrics,\n",
+    "    \"perplexity\": perplexity,\n",
+    "}\n",
+    "\n",
+    "validation_questions = dataset_dict[\"validation\"][\"question\"]\n",
+    "validation_answers = dataset_dict[\"validation\"][\"answer\"]\n",
+    "\n",
+    "predictions = generate_answers(\n",
+    "    model=trainer.model,\n",
+    "    tokenizer=tokenizer,\n",
+    "    questions=validation_questions,\n",
+    "    max_new_tokens=MAX_NEW_TOKENS_EVAL,\n",
+    ")\n",
+    "\n",
+    "generation_metrics = compute_generation_metrics(predictions, validation_answers)\n",
+    "final_metrics[\"generation_metrics\"] = generation_metrics\n",
+    "\n",
+    "# One record per validation example, scored individually for later inspection.\n",
+    "prediction_rows = []\n",
+    "aligned_examples = zip(validation_questions, validation_answers, predictions)\n",
+    "for i, (question, reference, prediction) in enumerate(aligned_examples):\n",
+    "    row = {\n",
+    "        \"row_id\": i,\n",
+    "        \"question\": question,\n",
+    "        \"reference_answer\": reference,\n",
+    "        \"predicted_answer\": prediction,\n",
+    "        \"exact_match\": exact_match(prediction, reference),\n",
+    "        \"token_f1\": token_f1(prediction, reference),\n",
+    "    }\n",
+    "    prediction_rows.append(row)\n",
+    "\n",
+    "save_predictions_jsonl(PREDICTIONS_JSONL_PATH, prediction_rows)\n",
+    "\n",
+    "print(\"Generation metrics:\")\n",
+    "print(json.dumps(generation_metrics, indent=2, ensure_ascii=False))\n",
+    "\n",
+    "print_banner(\"Sample validation predictions\")\n",
+    "for row in prediction_rows[:NUM_PRINTED_PREDICTIONS]:\n",
+    "    print(f\"\\n[Q] {row['question']}\")\n",
+    "    print(f\"[REF] {row['reference_answer']}\")\n",
+    "    print(f\"[PRED] {row['predicted_answer']}\")\n",
+    "    print(f\"[EM={row['exact_match']:.0f}, TokenF1={row['token_f1']:.4f}]\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "255eb7de",
+   "metadata": {},
+   "source": [
+    "## Part 17 - Save metrics\n",
+    "\n",
+    "This section writes the current metrics to JSON.\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "id": "ebd241d3",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Metrics saved to: /scr/user/kevin2002/TensorCat/NLP/UM_Handbook/outputs/qwen3_um_handbook/final_metrics.json\n",
+      "Predictions saved to: /scr/user/kevin2002/TensorCat/NLP/UM_Handbook/outputs/qwen3_um_handbook/validation_predictions.jsonl\n"
+     ]
+    }
+   ],
+   "source": [
+    "save_json(METRICS_JSON_PATH, final_metrics)\n",
+    "print(f\"Metrics saved to: {METRICS_JSON_PATH}\")\n",
+    "print(f\"Predictions saved to: {PREDICTIONS_JSONL_PATH}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "429b8fea",
+   "metadata": {},
+   "source": [
+    "## Part 18 - Merge the LoRA adapter and export the final model\n",
+    "\n",
+    "This section reloads the base model, merges the LoRA adapter, saves the merged model directory, and optionally exports a `.pt` file.\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "id": "720f1089",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "========================================================================================\n",
+      "Step 9 - Save merged model\n",
+      "========================================================================================\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "00dc5873f54b4054853f7908bd366489",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Loading weights:   0%|          | 0/399 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "5c8f48a945a84fa5b75150c6cb3939d6",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Merged model saved to: /scr/user/kevin2002/TensorCat/NLP/UM_Handbook/outputs/qwen3_um_handbook/merged_model\n",
+      "\n",
+      "========================================================================================\n",
+      "Saving single .pt state_dict export\n",
+      "========================================================================================\n",
+      "Saved .pt file to: /scr/user/kevin2002/TensorCat/NLP/UM_Handbook/outputs/qwen3_um_handbook/Qwen3-8B-Instruct_UM_Handbook.pt\n"
+     ]
+    }
+   ],
+   "source": [
+    "def load_base_model_for_merge(model_path: Path, backend: str):\n",
+    "    \"\"\"Load the base causal LM from ``model_path`` in the backend's compute dtype for merging.\"\"\"\n",
+    "    target_dtype = detect_compute_dtype(backend)\n",
+    "    load_options = dict(\n",
+    "        pretrained_model_name_or_path=str(model_path),\n",
+    "        torch_dtype=target_dtype,\n",
+    "        low_cpu_mem_usage=LOW_CPU_MEM_USAGE,\n",
+    "        trust_remote_code=False,\n",
+    "    )\n",
+    "    if backend == \"cuda\":\n",
+    "        load_options[\"device_map\"] = \"auto\"\n",
+    "\n",
+    "    base_model = AutoModelForCausalLM.from_pretrained(**load_options)\n",
+    "\n",
+    "    # device_map is only set for CUDA above, so MPS/CPU models are moved explicitly.\n",
+    "    if backend in {\"mps\", \"cpu\"}:\n",
+    "        return base_model.to(backend)\n",
+    "    return base_model\n",
+    "\n",
+    "def save_single_pt_state_dict(model, path: Path) -> None:\n",
+    "    \"\"\"Write the model's state_dict, copied to CPU, plus run metadata into one ``torch.save`` file.\"\"\"\n",
+    "    print_banner(\"Saving single .pt state_dict export\")\n",
+    "    ensure_dir(path.parent)\n",
+    "\n",
+    "    cpu_weights = {name: tensor.detach().cpu() for name, tensor in model.state_dict().items()}\n",
+    "    export_payload = {\n",
+    "        \"model_state_dict\": cpu_weights,\n",
+    "        \"base_model_name\": BASE_MODEL_NAME,\n",
+    "        \"system_prompt\": SYSTEM_PROMPT,\n",
+    "        \"max_seq_length\": MAX_SEQ_LENGTH,\n",
+    "    }\n",
+    "    torch.save(export_payload, str(path))\n",
+    "    print(f\"Saved .pt file to: {path}\")\n",
+    "\n",
+    "print_banner(\"Step 9 - Save merged model\")\n",
+    "cleanup_memory()\n",
+    "\n",
+    "if SAVE_MERGED_MODEL:\n",
+    "    # Reload the base model, attach the saved adapter, then fold the LoRA weights into it.\n",
+    "    base_model_for_merge = load_base_model_for_merge(local_model_path, RUNTIME_DEVICE_BACKEND)\n",
+    "    merged_model = PeftModel.from_pretrained(\n",
+    "        base_model_for_merge, str(ADAPTER_OUTPUT_DIR)\n",
+    "    ).merge_and_unload()\n",
+    "\n",
+    "    ensure_dir(MERGED_MODEL_DIR)\n",
+    "    merged_model.save_pretrained(str(MERGED_MODEL_DIR), safe_serialization=True)\n",
+    "    if SAVE_TOKENIZER_WITH_MERGED:\n",
+    "        tokenizer.save_pretrained(str(MERGED_MODEL_DIR))\n",
+    "    print(f\"Merged model saved to: {MERGED_MODEL_DIR}\")\n",
+    "\n",
+    "    if SAVE_SINGLE_PT:\n",
+    "        save_single_pt_state_dict(merged_model, FINAL_PT_PATH)\n",
+    "\n",
+    "    # Drop both model references before asking for memory cleanup.\n",
+    "    del merged_model, base_model_for_merge\n",
+    "    cleanup_memory()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f2973f9b",
+   "metadata": {},
+   "source": [
+    "## Part 19 - End-of-training summary\n",
+    "\n",
+    "This section adds a completion note to the metrics JSON, saves it again, and prints the final output paths.\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "id": "6891902c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "========================================================================================\n",
+      "Done\n",
+      "========================================================================================\n",
+      "Selected backend: cuda\n",
+      "Adapter directory: /scr/user/kevin2002/TensorCat/NLP/UM_Handbook/outputs/qwen3_um_handbook/lora_adapter\n",
+      "Merged model directory: /scr/user/kevin2002/TensorCat/NLP/UM_Handbook/outputs/qwen3_um_handbook/merged_model\n",
+      "Single .pt file: /scr/user/kevin2002/TensorCat/NLP/UM_Handbook/outputs/qwen3_um_handbook/Qwen3-8B-Instruct_UM_Handbook.pt\n",
+      "Metrics JSON: /scr/user/kevin2002/TensorCat/NLP/UM_Handbook/outputs/qwen3_um_handbook/final_metrics.json\n",
+      "Predictions JSONL: /scr/user/kevin2002/TensorCat/NLP/UM_Handbook/outputs/qwen3_um_handbook/validation_predictions.jsonl\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Placeholder only: no runtime is computed in this cell. The previous try/except wrapped a plain\n",
+    "# string assignment that cannot raise, so it has been reduced to the assignment itself.\n",
+    "total_runtime_minutes = \"See notebook runtime from execution order / timestamps\"\n",
+    "\n",
+    "# Re-save the metrics so the file on disk records that the notebook ran to the end.\n",
+    "final_metrics[\"completion_note\"] = \"Notebook execution completed.\"\n",
+    "save_json(METRICS_JSON_PATH, final_metrics)\n",
+    "\n",
+    "print_banner(\"Done\")\n",
+    "print(f\"Selected backend: {RUNTIME_DEVICE_BACKEND}\")\n",
+    "print(f\"Adapter directory: {ADAPTER_OUTPUT_DIR}\")\n",
+    "print(f\"Merged model directory: {MERGED_MODEL_DIR}\")\n",
+    "print(f\"Single .pt file: {FINAL_PT_PATH if SAVE_SINGLE_PT else 'disabled'}\")\n",
+    "print(f\"Metrics JSON: {METRICS_JSON_PATH}\")\n",
+    "print(f\"Predictions JSONL: {PREDICTIONS_JSONL_PATH}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e35a1ca8",
+   "metadata": {},
+   "source": [
+    "## Part 20 - Result inspection\n",
+    "\n",
+    "Check these files after training:\n",
+    "\n",
+    "### 1. `final_metrics.json`\n",
+    "Review the overall metrics.\n",
+    "\n",
+    "### 2. `validation_predictions.jsonl`\n",
+    "Inspect generated answers against the reference answers.\n",
+    "\n",
+    "### 3. `merged_model/`\n",
+    "Use this directory for standard Hugging Face loading.\n",
+    "\n",
+    "### 4. `Qwen3-8B-Instruct_UM_Handbook.pt`\n",
+    "This is the optional single-file export.\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "91778773",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python (TensorCat Py3.10)",
+   "language": "python",
+   "name": "tensorcat-py310"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.14"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

UM_Handbook/Dataset/Manual_Index/UM_Manual_Core_Question_Index.json ADDED Viewed

The diff for this file is too large to render. See raw diff

UM_Handbook/Dataset/SFT Dataset Core Questions Draft/SFT_QA_Draft_Build_Report.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "stage": "baseline_1",
+  "format": "question_answer_only",
+  "inputs": {
+    "index_path": "/Users/kevin/PycharmProjects/TensorCat/NLP_Group_Project/UM_Handbook/Dataset/Manual_Index/UM_Manual_Index.json",
+    "chunk_path": "/Users/kevin/PycharmProjects/TensorCat/NLP_Group_Project/UM_Handbook/Dataset/Source Chunk Dataset/Source_Chunks_Dataset.jsonl"
+  },
+  "outputs": {
+    "metadata_path": "/Users/kevin/PycharmProjects/TensorCat/NLP_Group_Project/UM_Handbook/Dataset/SFT_Dataset_Draft/SFT_QA_Metadata_Draft.jsonl",
+    "metadata_pretty_path": "/Users/kevin/PycharmProjects/TensorCat/NLP_Group_Project/UM_Handbook/Dataset/SFT_Dataset_Draft/SFT_QA_Metadata_Draft_pretty.json",
+    "training_ready_path": "/Users/kevin/PycharmProjects/TensorCat/NLP_Group_Project/UM_Handbook/Dataset/SFT_Dataset_Draft/SFT_QA_Training_Draft.jsonl",
+    "training_ready_pretty_path": "/Users/kevin/PycharmProjects/TensorCat/NLP_Group_Project/UM_Handbook/Dataset/SFT_Dataset_Draft/SFT_QA_Training_Draft_pretty.json"
+  },
+  "counts": {
+    "index_rows": 395,
+    "chunk_rows": 521,
+    "metadata_rows": 395,
+    "training_ready_rows": 388,
+    "matched_rows": 388,
+    "unmatched_rows": 7,
+    "filtered_bad_match_rows": 0
+  },
+  "notes": [
+    "This build is for Baseline 1 only.",
+    "Training-ready rows contain only question and answer fields.",
+    "Exact linked_index_id candidates are preferred when available.",
+    "Bad cover/content/heading-only answers are filtered out.",
+    "Vision/Mission/Objectives questions use explicit label-aware extraction when possible."
+  ]
+}

UM_Handbook/Dataset/SFT Dataset Core Questions Draft/SFT_QA_Metadata_Draft.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

UM_Handbook/Dataset/SFT Dataset Core Questions Draft/SFT_QA_Metadata_Draft_pretty.json ADDED Viewed

The diff for this file is too large to render. See raw diff

UM_Handbook/Dataset/SFT Dataset Core Questions Draft/SFT_QA_Training_Draft.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

UM_Handbook/Dataset/SFT Dataset Core Questions Draft/SFT_QA_Training_Draft_pretty.json ADDED Viewed

The diff for this file is too large to render. See raw diff

UM_Handbook/Dataset/SFT_Dataset/SFT_QA_Metadata.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

UM_Handbook/Dataset/SFT_Dataset/SFT_QA_Metadata_pretty.json ADDED Viewed

The diff for this file is too large to render. See raw diff

UM_Handbook/Dataset/SFT_Dataset/SFT_QA_Training_Ready.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

UM_Handbook/Dataset/SFT_Dataset/SFT_QA_Training_Ready_pretty.json ADDED Viewed

The diff for this file is too large to render. See raw diff

UM_Handbook/Dataset/Source Chunk Dataset/Source_Chunks_Dataset.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

UM_Handbook/Dataset/Source Chunk Dataset/Source_Chunks_Dataset_pretty.json ADDED Viewed

The diff for this file is too large to render. See raw diff

UM_Handbook/Dataset/Source Chunk Dataset/Source_Chunks_Dataset_report.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "total_chunks": 521,
+  "scope_distribution": {
+    "general": 58,
+    "postgraduate": 250,
+    "undergraduate": 213
+  },
+  "notes": [
+    "Chunks are generated from the structured markdown files, not directly from raw PDF pages.",
+    "Low-information cover/content/divider chunks are filtered out.",
+    "Chunk pages are preserved from per-page markdown markers when available.",
+    "Linked Manual_Index ids are based on exact section/subsection matches from UM_Manual_Index.json."
+  ]
+}

UM_Handbook/Dataset/markdown/complete_handbook_structured.md ADDED Viewed

The diff for this file is too large to render. See raw diff

UM_Handbook/Dataset/markdown/general_handbook_structured.md ADDED Viewed

	@@ -0,0 +1,488 @@

+# General Handbook (Structured Markdown)
+## Faculty Objectives :: Faculty Objectives
+- scope_label: general
+- source_doc: General Handbook
+- pages: 9-9
+### Page 9
+VISION
+A global faculty impacting the world
+MISSION
+Propelling computing technology and
+producing world class leaders
+OBJECTIVES
+To sustain an outstanding faculty dedicated to excellence in
+undergraduate and postgraduate teaching, learning and research.
+To contribute towards the development of the nation through the
+production of quality research and publications.
+To provide innovative academic programs that can respond to the
+changing needs of the society.
+To produce quality graduates who are equipped with advanced
+knowledge and skills of computer science and information technology.
+## History of the Faculty :: History Overview
+- scope_label: general
+- source_doc: General Handbook
+- pages: 10-11
+### Page 10
+The provision of computer facilities and services at the Universiti Malaya
+(UM) began soon after the Computer Centre was officially formed in 1965.
+This made the university one of the pioneers in computer usage in Malaysia.
+In December 1969, the Computer Centre took on an additional role of
+teaching and research of computer science and information technology. The
+Computer Centre Board was formed, comprising the Vice-Chancellor (as
+Chairman), the Director of Computer Centre (as Secretary), and a
+representative from each Faculty, Institute, Centre and the University
+Senate.
+In 1974, the Diploma in Computer Science programme was introduced.
+From its inception in the 1974/1975 Session to the 1997/1998 Session, a
+total of 300 students had been awarded the Diploma. The Master of
+Computer Science (MCS) and Doctor of Philosophy (Ph.D.) programmes
+were two (2) higher degree programmes by research approved by the Senate
+and had been administered by the Computer Centre since 1985. In addition,
+the Computer Centre offered a four (4) years Bachelor of Computer Science
+programme. The first undergraduate enrolment for the 1990/1991 Session
+was 50 students.
+On April 1st, 1993, the University Senate agreed to the formation of the
+Computer Centre Study Board. The Board proposed the establishment of a
+faculty to be called the Faculty of Computer Science and Information
+Technology (FCSIT). The existing Computer Centre was to be annulled and
+replaced by a Computer Services Division which was placed under the
+Chancellery.
+On September 22nd, 1994, the University of Malaya Council agreed to the
+formation of the Faculty of Computer Science and Information Technology
+(FCSIT), and the Computer Services Division. A sum of 4.2 million was
+obtained from the Ministry of Education under the Sixth Malaysian Plan to
+put up a new building for the faculty, with the necessary infrastructure for
+teaching, learning and research. The building was officially declared open by
+the Minister of Education, Dato' Sri Najib Tun Abdul Razak on September
+26th, 1996.
+HISTORY OF THE FACULTY
+### Page 11
+The Bachelor of Information Technology programme started in the
+1996/1997 Session, with an initial intake of 50 students. To accommodate
+an increased student population, an additional building was built which was
+officially opened by Datuk Fong Chan Onn, Deputy Minister of Education on
+September 21st, 1998.
+Since its establishment, the Faculty of Computer Science and Information
+Technology has been led by a number of distinguished persons. The
+following have served as Directors/Dean:
+HISTORY OF THE FACULTY
+1967 – 1973
+1973 – 1975
+1975 – 1978
+1978 – 1982
+1982 – 1990
+1990 – 1992
+1992 – 2000
+2000 – 2002
+2002 – 2004
+2004 – 2005
+2005 – 2006
+2006 – 2007
+2007 – 2009
+2009 – 2010
+2010 – 2011
+2011 – 2014
+2014 – 2017
+2017 – 2019
+2019 - 2021
+2022 –2024
+2024 - 2025
+2025 - Current
+Mr. Ong Yin Fook
+Professor Paul Peach
+Dr. R.K. Pillay
+Dr. Tan Bock Thiam
+Assoc. Prof. Ir. Dr. Mashkuri Yaacob
+Professor Lee Poh Aun
+Professor Ir. Dr. Mashkuri Yaacob
+Assoc. Prof. Dr. Siti Salwah Salim
+Assoc. Prof. Dr. Zainab Awang Ngah
+Professor Ir. Dr. N. Selvanathan
+Assoc. Prof. Dr. Siti Salwah Salim
+Professor Dato' Dr. Ir. Mashkuri Hj. Yaacob
+Professor Dr. Mohd Sapiyan Baba
+Professor Dr. David Ngo Chek Ling
+Professor Dr. Wan Ahmad Tajuddin Wan Abdullah
+Professor Dr. Siti Salwah Salim
+Professor Dr. Abdullah Gani
+Professor Dr. Abrizah Abdullah
+Professor Datin Dr. Sameem Abdul Kareem
+Professor Dr. Loo Chu Kiong
+Professor Ir. Dr. Chan Chee Seng
+Associate Professor Dr. Norisma Idris
+## Academic Calendar 2025/2026 :: Master and Doctorate Level Academic Calendar
+- scope_label: postgraduate
+- source_doc: General Handbook
+- pages: 4-4
+### Page 4
+Lampiran B2
+ACADEMIC CALENDAR 2025/2026 ACADEMIC SESSION
+(MASTER AND DOCTORATE LEVEL)
+AMENDMENT
+SEMESTER I
+Orientation (Week of Welcome)-WOW 1 week 05.10.2025 - 12.10.2025
+Lectures 6 weeks* 13.10.2025 - 23.11.2025
+Mid Semester I Break 1 week 24.11.2025 - 30.11.2025
+Lectures 8 weeks* 01.12.2025 - 25.01.2026
+Revision Week 1 week* 26.01.2026 - 01.02.2026
+Semester I Final Examination 3 weeks* 02.02.2026 - 22.02.2026
+Semester I Break 2 weeks 23.02.2026 - 08.03.2026
+22 weeks
+SEMESTER II
+Lectures 7 weeks* 09.03.2026 - 26.04.2026
+Mid Semester II Break 1 week 27.04.2026 - 03.05.2026
+Lectures 7 weeks* 04.05.2026 - 21.06.2026
+Revision Week 1 week* 22.06.2026 - 28.06.2026
+Semester II Final Examination 3 weeks* 29.06.2026 - 19.07.2026
+Semester II Break 4 weeks 20.07.2026 - 16.08.2026
+23 weeks
+SPECIAL SEMESTER
+Lectures 7 weeks* 27.07.2026 - 13.09.2026
+Special Semester Final Examination 1 week* 14.09.2026 - 20.09.2026
+Special Semester Break 1 week 21.09.2026 - 28.09.2026
+9 weeks
+Notes:
+(1) The Module Registration and Examination Schedule can be referred to at https://umsitsguide.um.edu.my. All
+information is subject to change.
+(2) The Academic Calendar has taken into account public and festive holidays and is subject to change.
+Deepavali 20 October 2025 (Monday)
+Christmas Day 25 December 2025 (Thursday)
+New Year 01 January 2026 (Thursday)
+Thaipusam 01 February 2026 (Sunday)
+Federal Territory Day 01 February 2026 (Sunday)
+Chinese New Year 17 & 18 February 2026 (Tuesday & Wednesday)
+Nuzul Al-Quran 07 March 2026 (Saturday)
+Eidul Fitri 20 & 21 March 2026 (Friday & Saturday)
+Labour Day 1 May 2026 (Friday)
+Eidul Adha 27 May 2026 (Wednesday)
+Wesak Day 31 May 2026 (Sunday)
+His Majesty the King's Birthday 1 June 2026 (Monday)
+Awal Muharram 16 June 2026 (Tuesday)
+Prophet Muhammad's (Maulidur Rasul) 25 August 2026 (Tuesday)
+National Day 31 August 2026 (Monday)
+Malaysia Day 16 September 2026 (Wednesday)
+UM PG Fest 2026 09-11 June 2026 (Tuesday - Thursday)
+*Senate August 28, 2025
+## Academic Calendar 2025/2026 :: Bachelor Degree Level Academic Calendar
+- scope_label: undergraduate
+- source_doc: General Handbook
+- pages: 12-12
+### Page 12
+ACADEMIC CALENDAR
+SESSION 2025/2026
+ACADEMIC CALENDAR 2025/2026 ACADEMIC SESSION
+(BACHELOR DEGREE LEVEL)
+AMENDMENT
+SEMESTER I
+Orientation (Week of Welcome)-WOW 1 week 05.10.2025 - 12.10.2025
+Lectures 6 weeks* 13.10.2025 - 23.11.2025
+Mid Semester I Break 1 week 24.11.2025 - 30.11.2025
+Lectures 8 weeks* 01.12.2025 - 25.01.2026
+Revision Week 1 week* 26.01.2026 - 01.02.2026
+Semester I Final Examination 3 weeks* 02.02.2026 - 22.02.2026
+Semester I Break 2 weeks 23.02.2026 - 08.03.2026
+22 weeks
+SEMESTER II
+Lectures 7 weeks* 09.03.2026 - 26.04.2026
+Mid Semester II Break 1 week 27.04.2026 - 03.05.2026
+Lectures 7 weeks* 04.05.2026 - 21.06.2026
+Revision Week 1 week* 22.06.2026 - 28.06.2026
+Semester II Final Examination 3 weeks* 29.06.2026 - 19.07.2026
+Semester II Break 4 weeks 20.07.2026 - 16.08.2026
+23 weeks
+SPECIAL SEMESTER
+Lectures 7 weeks* 27.07.2026 - 13.09.2026
+Special Semester Final Examination 1 week* 14.09.2026 - 20.09.2026
+Special Semester Break 1 week 21.09.2026 - 28.09.2026
+9 weeks
+Notes:
+(1) The Module Registration and Examination Schedule can be referred to at https://umsitsguide.um.edu.my. All
+information is subject to change.
+(2) The Academic Calendar has taken into account public and festive holidays and is subject to change.
+Deepavali 20 October 2025 (Monday)
+Christmas Day 25 December 2025 (Thursday)
+New Year 01 January 2026 (Thursday)
+Thaipusam 01 February 2026 (Sunday)
+Federal Territory Day 01 February 2026 (Sunday)
+Chinese New Year 17 & 18 February 2026 (Tuesday & Wednesday)
+Nuzul Al-Quran 07 March 2026 (Saturday)
+Eidul Fitri 20 & 21 March 2026 (Friday & Saturday)
+Labour Day 01 May 2026 (Friday)
+Eidul Adha 27 May 2026 (Wednesday)
+Wesak Day 31 May 2026 (Sunday)
+His Majesty the King's Birthday 1 June 2026 (Monday)
+Awal Muharram 16 June 2026 (Tuesday)
+Prophet Muhammad's (Maulidur Rasul) 25 August 2026 (Tuesday)
+National Day 31 August 2026 (Monday)
+Malaysia Day 16 September 2026 (Wednesday)
+UM UG Fest 2026 09-11 June 2026 (Tuesday - Thursday)
+*Senate August 28, 2025
+## Teaching and Learning Facilities :: Teaching Labs
+- scope_label: general
+- source_doc: General Handbook
+- pages: 13-14
+### Page 13
+(A) TEACHING LABS
+The Faculty of Computer Science and Information Technology provides 9 laboratories for
+teaching and learning purposes. The laboratories are as follows:
+BLOCK A
+Micro Lab 1 (MM1)
+This lab has 50 units of computer that are connected to Windows Active Directory servers and
+the Internet. The operating system for these PCs is Windows 10. This lab is opened to all
+FSKTM undergraduate students.
+Micro Lab 2 (MM2)
+This lab has 12 units of computer that are connected to Windows Active Directory servers and
+the Internet. The operating system for these PCs is Windows 10. This lab is opened to all
+FSKTM undergraduate students.
+Postgraduate Lab (ML)
+This lab has 33 units of computer. All the computers are connected to Windows Active Directory
+servers and the Internet. The operating system for these PCs is Windows 10. This lab is opened
+to all FSKTM postgraduate students.
+CCNA LAB (CCNA)
+This lab has 41 units of computer. The operating system for these workstations is Windows 10.
+There are also 25 units of Cisco 1700 Series Router, 4 units Cisco 1760 Series Router and 12
+units switch Cisco 2950 CATALYST Series. This lab is opened to all FSKTM students.
+Robotic Teaching Lab
+The Robotic Teaching Lab @ FCSIT is part of the Department of Artificial Intelligence effort to
+provide conducive intelligent learning environment to students taking the 'Intelligent Robotics'
+course. Equipped with six mobile robots, the lab allows space for hands-on and robotic
+experiments designed to help students understand the concept of robotic intelligence and
+acquire the needful skills for the course.
+TEACHING AND LEARNING FACILITIES
+FACULTY OF COMPUTER SCIENCE AND
+INFORMATION TECHNOLOGY
+### Page 14
+BLOCK B
+Micro Lab 3 (MM3)
+This lab has 60 units of computer that are connected to Windows Active Directory servers and
+the Internet. This lab is opened to undergraduate and postgraduate students.
+Micro Lab 4 (MM4)
+This lab has 60 units of computer that are connected to Windows Active Directory servers and
+the Internet. This lab is opened to undergraduate and postgraduate students.
+Micro Lab 6 (MM6)
+This lab has 45 units of computer that are connected to Windows Active Directory servers and
+the Internet. This lab is opened to all FSKTM students but priority is given to multimedia courses.
+Operating system – Windows 10.
+Stroustrup Lab 1
+This lab has 42 units of computer that are connected to the Internet. This lab is opened to
+undergraduate students. Operating system – Windows 10.
+(B) RESEARCH LABS
+29 research labs to support postgraduate students research activities, managed by various
+departments in the faculty:
+BLOCK A
+Computer Technology Lab
+This lab is opened to post-graduate student, priority given to students who are taking courses
+related to the field Computer Technology.
+TEACHING AND LEARNING FACILITIES
+FACULTY OF COMPUTER SCIENCE AND
+INFORMATION TECHNOLOGY
+## Teaching and Learning Facilities :: Research Labs
+- scope_label: general
+- source_doc: General Handbook
+- pages: 14-16
+### Page 14
+BLOCK B
+Micro Lab 3 (MM3)
+This lab has 60 units of computer that are connected to Windows Active Directory servers and
+the Internet. This lab is opened to undergraduate and postgraduate students.
+Micro Lab 4 (MM4)
+This lab has 60 units of computer that are connected to Windows Active Directory servers and
+the Internet. This lab is opened to undergraduate and postgraduate students.
+Micro Lab 6 (MM6)
+This lab has 45 units of computer that are connected to Windows Active Directory servers and
+the Internet. This lab is opened to all FSKTM students but priority is given to multimedia courses.
+Operating system – Windows 10.
+Stroustrup Lab 1
+This lab has 42 units of computer that are connected to the Internet. This lab is opened to
+undergraduate students. Operating system – Windows 10.
+(B) RESEARCH LABS
+29 research labs to support postgraduate students research activities, managed by various
+departments in the faculty:
+BLOCK A
+Computer Technology Lab
+This lab is opened to post-graduate student, priority given to students who are taking courses
+related to the field Computer Technology.
+TEACHING AND LEARNING FACILITIES
+FACULTY OF COMPUTER SCIENCE AND
+INFORMATION TECHNOLOGY
+### Page 15
+TEACHING AND LEARNING FACILITIES
+FACULTY OF COMPUTER SCIENCE AND
+INFORMATION TECHNOLOGY
+BLOCK B
+Artificial Intelligence Research Lab
+Qualitative reasoning, qualitative modeling, Intelligent Tutoring System, Case-based System,
+Intelligent Interactive Multimedia System.
+VLSI Research Lab
+The study of the performance and the implementation of fast pipelined floating-point arithmetic
+circuits and arithmetic algorithm, as well as on designing VLSI. Focus is given to the aspect of
+VLSI circuits test.
+Computer Systems and Network Research Lab
+Focus on data security research through networking, ability of protocols and ATM studies.
+Multimedia Research Lab
+Research and development comprise:
+Corporate training
+Smart school education software
+Distributed multimedia systems
+Web-based multimedia systems
+Multimedia Storage & retrieval technology
+Multimedia input & output technology
+Human Computer Interaction (HCI) Research Lab
+This lab is used for conducting research on usability area, computer support cooperative work
+(CSCW) and task analysis. It involves task analysis hierarchy chart for user understandability test
+in implementing any task.
+### Page 16
+Information System Research Lab
+This lab is used for conducting research on dissimilar information systems integration in
+heterogeneous environment including operating system, hardware, language and the use of the
+latest software industrial standard to integrate information systems.
+Research and development on:
+Business Oriented Systems/ Electronic Government Systems
+Geographic Information Systems
+Inter-organizational Information Systems
+Web-based Information Systems
+Smart Card Application
+Stroustrup Lab 2
+This lab has 18 units of computer that are connected to the Internet. This lab is opened to
+undergraduate students taking courses related to electronic circuit.
+(C) PROJECT BASED LAB
+Artificial Intelligence 4 U (AI4U)
+AI-based Machine Vision essentials. Key objective is to transfer ‘AI-based machine
+vision’ knowledge to university lecturers and students.
+Wisma R&D (15th floor):
+Web Based Information System Lab & Knowledge Engineering Lab (Open-Space
+Concept)
+Both the Knowledge Engineering Lab and the Web-Based Information System Lab are open-
+space concept labs accessible to all postgraduate students, regardless of their field. Students
+can use the space and facilities provided in these labs, with permission for access. These labs
+are located on the 15th floor of Wisma R&D
+Robotedge AI Robotic Lab
+This lab is previously known as Natural Language Processing Lab. This lab is equipped with
+equipment for AI robotics research and development focusing on environmental, home services,
+and search and rescue research areas.
+TEACHING AND LEARNING FACILITIES
+FACULTY OF COMPUTER SCIENCE AND
+INFORMATION TECHNOLOGY
+## Teaching and Learning Facilities :: Project Based Labs
+- scope_label: general
+- source_doc: General Handbook
+- pages: 16-16
+### Page 16
+Information System Research Lab
+This lab is used for conducting research on dissimilar information systems integration in
+heterogeneous environment including operating system, hardware, language and the use of the
+latest software industrial standard to integrate information systems.
+Research and development on:
+Business Oriented Systems/ Electronic Government Systems
+Geographic Information Systems
+Inter-organizational Information Systems
+Web-based Information Systems
+Smart Card Application
+Stroustrup Lab 2
+This lab has 18 units of computer that are connected to the Internet. This lab is opened to
+undergraduate students taking courses related to electronic circuit.
+(C) PROJECT BASED LAB
+Artificial Intelligence 4 U (AI4U)
+AI-based Machine Vision essentials. Key objective is to transfer ‘AI-based machine
+vision’ knowledge to university lecturers and students.
+Wisma R&D (15th floor):
+Web Based Information System Lab & Knowledge Engineering Lab (Open-Space
+Concept)
+Both the Knowledge Engineering Lab and the Web-Based Information System Lab are open-
+space concept labs accessible to all postgraduate students, regardless of their field. Students
+can use the space and facilities provided in these labs, with permission for access. These labs
+are located on the 15th floor of Wisma R&D
+Robotedge AI Robotic Lab
+This lab is previously known as Natural Language Processing Lab. This lab is equipped with
+equipment for AI robotics research and development focusing on environmental, home services,
+and search and rescue research areas.
+TEACHING AND LEARNING FACILITIES
+FACULTY OF COMPUTER SCIENCE AND
+INFORMATION TECHNOLOGY
+## Other Facilities :: Student Support and Campus Facilities
+- scope_label: general
+- source_doc: General Handbook
+- pages: 17-17
+### Page 17
+Prayer Room (surau)
+Air-conditioned prayer rooms (surau) (one for Men, and the other for Women) are provided in Block A for
+Muslims to pray. The surau for Men is located at the second floor and surau for women is located at the first
+floor in the building. Users are not allowed to sleep and eat in the surau. Users are also responsible for the
+cleanliness of the surau.
+Vending Machine (Drinks)
+There are 4 units of vending machine for cold drinks located at Block A and Block B.
+Cafeteria
+Cafeteria is located at the back of Block A.
+Postgraduate Lounge & Student Centre
+Space provided for students to relax their minds, have informal discussions and hold small gatherings. A few
+facilities such as sofas, computers, discussion rooms and pantry are ready to use.
+Parking Lot
+The Faculty also provides parking lots for students to park their car or motorbike. Students can park their car or
+motorbike at the back of Block A. There are 150 parking lots for the motorbike and 45 for the car. Students are
+not allowed to park their car in front of both buildings because the parking lots are reserved for the faculty staff
+and visitors.
+Water Purifiers
+Water purifiers are provided in both buildings and placed at every floor.
+Internet Access at the building of FCSIT
+There is Wi-Fi Internet access provided to students at every floor in each building. Students must obey the
+rules and regulations during the usage of these facilities.
+SPeCTRUM (Student Powered e-Collaboration Transforming UM)
+This facility is for easy accessibility for student to upload their notes and information regarding their courses.
+All faculties (excluding Faculty of Medicine & Faculty of Dentistry) and PASUM can browse the SPECTRUM
+website at https://spectrum.um.edu.my/
+For Faculty of Medicine and Faculty of Dentistry, SPECTRUM website can be browsed at
+https://spectrumx.um.edu.my/
+All queries and suggestions can be directed to https://helpdesk.um.edu.my/
+Door Access
+Students must register for door access for using research labs, Student Center and Postgraduate Lounge.
+OTHER FACILITIES
+FACULTY OF COMPUTER SCIENCE AND
+INFORMATION TECHNOLOGY
+1.
+2.
+3.
+4.
+5.
+6.
+7.
+8.
+9.

UM_Handbook/Dataset/pdf/Complete Handbook.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:19204e4c67cb49051387ad558bed58a302ea332f66ae82e297ffb19347ca5455
+size 35690631

UM_Handbook/Dataset/pdf/General Handbook.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c2e68a06c7da277fbe1256885f7062c4c91d3bb8bbcd341a583da45a5119b0ec
+size 3286027

UM_Handbook/Dataset/reports/um_handbook_markdown_report.json ADDED Viewed

	@@ -0,0 +1,2771 @@

+{
+  "generated_files": {
+    "general_markdown": "/Users/kevin/PycharmProjects/TensorCat/NLP_Group_Project/UM_Handbook/Dataset/markdown/general_handbook_structured.md",
+    "complete_markdown": "/Users/kevin/PycharmProjects/TensorCat/NLP_Group_Project/UM_Handbook/Dataset/markdown/complete_handbook_structured.md"
+  },
+  "general_block_count": 8,
+  "complete_block_count": 75,
+  "general_blocks": [
+    {
+      "section": "Faculty Objectives",
+      "subsection": "Faculty Objectives",
+      "scope_label": "general",
+      "source_doc": "General Handbook",
+      "pages": [
+        9,
+        9
+      ],
+      "page_stats": [
+        {
+          "page": 9,
+          "source": "native",
+          "chars": 590,
+          "seconds": 0.03
+        }
+      ],
+      "total_chars": 601,
+      "seconds": 0.03
+    },
+    {
+      "section": "History of the Faculty",
+      "subsection": "History Overview",
+      "scope_label": "general",
+      "source_doc": "General Handbook",
+      "pages": [
+        10,
+        11
+      ],
+      "page_stats": [
+        {
+          "page": 10,
+          "source": "native",
+          "chars": 2073,
+          "seconds": 0.02
+        },
+        {
+          "page": 11,
+          "source": "native",
+          "chars": 1467,
+          "seconds": 0.01
+        }
+      ],
+      "total_chars": 3566,
+      "seconds": 0.02
+    },
+    {
+      "section": "Academic Calendar 2025/2026",
+      "subsection": "Master and Doctorate Level Academic Calendar",
+      "scope_label": "postgraduate",
+      "source_doc": "General Handbook",
+      "pages": [
+        4,
+        4
+      ],
+      "page_stats": [
+        {
+          "page": 4,
+          "source": "ocr",
+          "chars": 2004,
+          "seconds": 1.56
+        }
+      ],
+      "total_chars": 2015,
+      "seconds": 1.56
+    },
+    {
+      "section": "Academic Calendar 2025/2026",
+      "subsection": "Bachelor Degree Level Academic Calendar",
+      "scope_label": "undergraduate",
+      "source_doc": "General Handbook",
+      "pages": [
+        12,
+        12
+      ],
+      "page_stats": [
+        {
+          "page": 12,
+          "source": "ocr",
+          "chars": 2038,
+          "seconds": 1.58
+        }
+      ],
+      "total_chars": 2050,
+      "seconds": 1.58
+    },
+    {
+      "section": "Teaching and Learning Facilities",
+      "subsection": "Teaching Labs",
+      "scope_label": "general",
+      "source_doc": "General Handbook",
+      "pages": [
+        13,
+        14
+      ],
+      "page_stats": [
+        {
+          "page": 13,
+          "source": "native",
+          "chars": 1693,
+          "seconds": 0.01
+        },
+        {
+          "page": 14,
+          "source": "native",
+          "chars": 1182,
+          "seconds": 0.01
+        }
+      ],
+      "total_chars": 2901,
+      "seconds": 0.02
+    },
+    {
+      "section": "Teaching and Learning Facilities",
+      "subsection": "Research Labs",
+      "scope_label": "general",
+      "source_doc": "General Handbook",
+      "pages": [
+        14,
+        16
+      ],
+      "page_stats": [
+        {
+          "page": 14,
+          "source": "native",
+          "chars": 1182,
+          "seconds": 0.0
+        },
+        {
+          "page": 15,
+          "source": "native",
+          "chars": 1146,
+          "seconds": 0.01
+        },
+        {
+          "page": 16,
+          "source": "native",
+          "chars": 1630,
+          "seconds": 0.01
+        }
+      ],
+      "total_chars": 3998,
+      "seconds": 0.02
+    },
+    {
+      "section": "Teaching and Learning Facilities",
+      "subsection": "Project Based Labs",
+      "scope_label": "general",
+      "source_doc": "General Handbook",
+      "pages": [
+        16,
+        16
+      ],
+      "page_stats": [
+        {
+          "page": 16,
+          "source": "native",
+          "chars": 1630,
+          "seconds": 0.0
+        }
+      ],
+      "total_chars": 1642,
+      "seconds": 0.01
+    },
+    {
+      "section": "Other Facilities",
+      "subsection": "Student Support and Campus Facilities",
+      "scope_label": "general",
+      "source_doc": "General Handbook",
+      "pages": [
+        17,
+        17
+      ],
+      "page_stats": [
+        {
+          "page": 17,
+          "source": "native",
+          "chars": 2156,
+          "seconds": 0.01
+        }
+      ],
+      "total_chars": 2168,
+      "seconds": 0.01
+    }
+  ],
+  "complete_blocks": [
+    {
+      "section": "Postgraduate Faculty Identity",
+      "subsection": "Vision and Mission",
+      "scope_label": "postgraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        186,
+        186
+      ],
+      "page_stats": [
+        {
+          "page": 186,
+          "source": "native",
+          "chars": 714,
+          "seconds": 0.01
+        }
+      ],
+      "total_chars": 727,
+      "seconds": 0.01
+    },
+    {
+      "section": "Faculty Staff",
+      "subsection": "Dean's Office and Management",
+      "scope_label": "general",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        6,
+        8
+      ],
+      "page_stats": [
+        {
+          "page": 6,
+          "source": "ocr",
+          "chars": 521,
+          "seconds": 0.62
+        },
+        {
+          "page": 7,
+          "source": "ocr",
+          "chars": 772,
+          "seconds": 0.75
+        },
+        {
+          "page": 8,
+          "source": "ocr",
+          "chars": 443,
+          "seconds": 0.4
+        }
+      ],
+      "total_chars": 1773,
+      "seconds": 1.77
+    },
+    {
+      "section": "Faculty Staff",
+      "subsection": "Department of Artificial Intelligence",
+      "scope_label": "general",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        9,
+        12
+      ],
+      "page_stats": [
+        {
+          "page": 9,
+          "source": "ocr",
+          "chars": 1438,
+          "seconds": 1.0
+        },
+        {
+          "page": 10,
+          "source": "ocr",
+          "chars": 1620,
+          "seconds": 1.3
+        },
+        {
+          "page": 11,
+          "source": "ocr",
+          "chars": 1392,
+          "seconds": 1.17
+        },
+        {
+          "page": 12,
+          "source": "ocr",
+          "chars": 1388,
+          "seconds": 1.25
+        }
+      ],
+      "total_chars": 5891,
+      "seconds": 4.72
+    },
+    {
+      "section": "Faculty Staff",
+      "subsection": "Department of Software Engineering",
+      "scope_label": "general",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        13,
+        16
+      ],
+      "page_stats": [
+        {
+          "page": 13,
+          "source": "ocr",
+          "chars": 1822,
+          "seconds": 1.39
+        },
+        {
+          "page": 14,
+          "source": "ocr",
+          "chars": 1655,
+          "seconds": 1.36
+        },
+        {
+          "page": 15,
+          "source": "ocr",
+          "chars": 1857,
+          "seconds": 1.41
+        },
+        {
+          "page": 16,
+          "source": "ocr",
+          "chars": 1550,
+          "seconds": 1.72
+        }
+      ],
+      "total_chars": 6938,
+      "seconds": 5.88
+    },
+    {
+      "section": "Faculty Staff",
+      "subsection": "Department of Information Systems",
+      "scope_label": "general",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        17,
+        20
+      ],
+      "page_stats": [
+        {
+          "page": 17,
+          "source": "ocr",
+          "chars": 1455,
+          "seconds": 2.99
+        },
+        {
+          "page": 18,
+          "source": "ocr",
+          "chars": 1576,
+          "seconds": 1.13
+        },
+        {
+          "page": 19,
+          "source": "ocr",
+          "chars": 1466,
+          "seconds": 0.94
+        },
+        {
+          "page": 20,
+          "source": "ocr",
+          "chars": 269,
+          "seconds": 0.46
+        }
+      ],
+      "total_chars": 4820,
+      "seconds": 5.51
+    },
+    {
+      "section": "Postgraduate General Information",
+      "subsection": "Legislation and Prescribed Rules",
+      "scope_label": "postgraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        126,
+        126
+      ],
+      "page_stats": [
+        {
+          "page": 126,
+          "source": "ocr",
+          "chars": 979,
+          "seconds": 0.8
+        }
+      ],
+      "total_chars": 992,
+      "seconds": 0.8
+    },
+    {
+      "section": "Postgraduate General Information",
+      "subsection": "Marking Scheme and Grade Point Average (GPA)",
+      "scope_label": "postgraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        127,
+        127
+      ],
+      "page_stats": [
+        {
+          "page": 127,
+          "source": "ocr",
+          "chars": 446,
+          "seconds": 0.74
+        }
+      ],
+      "total_chars": 459,
+      "seconds": 0.74
+    },
+    {
+      "section": "Research Guidance",
+      "subsection": "Progress Report",
+      "scope_label": "postgraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        129,
+        129
+      ],
+      "page_stats": [
+        {
+          "page": 129,
+          "source": "ocr",
+          "chars": 640,
+          "seconds": 0.58
+        }
+      ],
+      "total_chars": 653,
+      "seconds": 0.58
+    },
+    {
+      "section": "Research Guidance",
+      "subsection": "Supervision Policy for Postgraduate Programmes",
+      "scope_label": "postgraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        130,
+        137
+      ],
+      "page_stats": [
+        {
+          "page": 130,
+          "source": "ocr",
+          "chars": 1555,
+          "seconds": 1.4
+        },
+        {
+          "page": 131,
+          "source": "ocr",
+          "chars": 2554,
+          "seconds": 1.85
+        },
+        {
+          "page": 132,
+          "source": "ocr",
+          "chars": 1256,
+          "seconds": 1.05
+        },
+        {
+          "page": 133,
+          "source": "ocr",
+          "chars": 347,
+          "seconds": 0.49
+        },
+        {
+          "page": 134,
+          "source": "ocr",
+          "chars": 2299,
+          "seconds": 1.63
+        },
+        {
+          "page": 135,
+          "source": "ocr",
+          "chars": 2336,
+          "seconds": 1.78
+        },
+        {
+          "page": 136,
+          "source": "ocr",
+          "chars": 1592,
+          "seconds": 1.22
+        },
+        {
+          "page": 137,
+          "source": "ocr",
+          "chars": 587,
+          "seconds": 0.85
+        }
+      ],
+      "total_chars": 12644,
+      "seconds": 10.26
+    },
+    {
+      "section": "Research Guidance",
+      "subsection": "Thesis Preparation Guidelines",
+      "scope_label": "postgraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        138,
+        171
+      ],
+      "page_stats": [
+        {
+          "page": 138,
+          "source": "ocr",
+          "chars": 1525,
+          "seconds": 1.2
+        },
+        {
+          "page": 139,
+          "source": "ocr",
+          "chars": 1389,
+          "seconds": 1.03
+        },
+        {
+          "page": 140,
+          "source": "ocr",
+          "chars": 2853,
+          "seconds": 1.68
+        },
+        {
+          "page": 141,
+          "source": "ocr",
+          "chars": 778,
+          "seconds": 0.71
+        },
+        {
+          "page": 142,
+          "source": "ocr",
+          "chars": 2585,
+          "seconds": 1.66
+        },
+        {
+          "page": 143,
+          "source": "ocr",
+          "chars": 2168,
+          "seconds": 2.02
+        },
+        {
+          "page": 144,
+          "source": "ocr",
+          "chars": 1149,
+          "seconds": 1.0
+        },
+        {
+          "page": 145,
+          "source": "ocr",
+          "chars": 705,
+          "seconds": 0.85
+        },
+        {
+          "page": 146,
+          "source": "ocr",
+          "chars": 1735,
+          "seconds": 1.26
+        },
+        {
+          "page": 147,
+          "source": "ocr",
+          "chars": 1045,
+          "seconds": 0.8
+        },
+        {
+          "page": 148,
+          "source": "ocr",
+          "chars": 750,
+          "seconds": 0.73
+        },
+        {
+          "page": 149,
+          "source": "ocr",
+          "chars": 1195,
+          "seconds": 1.39
+        },
+        {
+          "page": 150,
+          "source": "ocr",
+          "chars": 95,
+          "seconds": 0.47
+        },
+        {
+          "page": 151,
+          "source": "ocr",
+          "chars": 1446,
+          "seconds": 1.0
+        },
+        {
+          "page": 152,
+          "source": "ocr",
+          "chars": 2698,
+          "seconds": 1.78
+        },
+        {
+          "page": 153,
+          "source": "ocr",
+          "chars": 1588,
+          "seconds": 1.54
+        },
+        {
+          "page": 154,
+          "source": "ocr",
+          "chars": 2154,
+          "seconds": 1.46
+        },
+        {
+          "page": 155,
+          "source": "ocr",
+          "chars": 2084,
+          "seconds": 1.37
+        },
+        {
+          "page": 156,
+          "source": "ocr",
+          "chars": 1279,
+          "seconds": 0.94
+        },
+        {
+          "page": 157,
+          "source": "ocr",
+          "chars": 1991,
+          "seconds": 1.37
+        },
+        {
+          "page": 158,
+          "source": "ocr",
+          "chars": 2460,
+          "seconds": 1.64
+        },
+        {
+          "page": 159,
+          "source": "ocr",
+          "chars": 1064,
+          "seconds": 1.05
+        },
+        {
+          "page": 160,
+          "source": "ocr",
+          "chars": 1916,
+          "seconds": 1.25
+        },
+        {
+          "page": 161,
+          "source": "ocr",
+          "chars": 1922,
+          "seconds": 1.28
+        },
+        {
+          "page": 162,
+          "source": "ocr",
+          "chars": 1451,
+          "seconds": 1.06
+        },
+        {
+          "page": 163,
+          "source": "ocr",
+          "chars": 984,
+          "seconds": 0.74
+        },
+        {
+          "page": 164,
+          "source": "ocr",
+          "chars": 189,
+          "seconds": 0.47
+        },
+        {
+          "page": 165,
+          "source": "ocr",
+          "chars": 519,
+          "seconds": 0.82
+        },
+        {
+          "page": 166,
+          "source": "ocr",
+          "chars": 343,
+          "seconds": 1.04
+        },
+        {
+          "page": 167,
+          "source": "ocr",
+          "chars": 1223,
+          "seconds": 1.89
+        },
+        {
+          "page": 168,
+          "source": "ocr",
+          "chars": 1905,
+          "seconds": 1.21
+        },
+        {
+          "page": 169,
+          "source": "ocr",
+          "chars": 2290,
+          "seconds": 2.18
+        },
+        {
+          "page": 170,
+          "source": "ocr",
+          "chars": 477,
+          "seconds": 0.47
+        },
+        {
+          "page": 171,
+          "source": "ocr",
+          "chars": 1465,
+          "seconds": 1.03
+        }
+      ],
+      "total_chars": 49928,
+      "seconds": 40.36
+    },
+    {
+      "section": "Research Guidance",
+      "subsection": "Thesis or Dissertation Submission and Examinations",
+      "scope_label": "postgraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        172,
+        172
+      ],
+      "page_stats": [
+        {
+          "page": 172,
+          "source": "native_filtered",
+          "chars": 0,
+          "seconds": 0.33
+        }
+      ],
+      "total_chars": 0,
+      "seconds": 0.33
+    },
+    {
+      "section": "Research Guidance",
+      "subsection": "Publication Requirement",
+      "scope_label": "postgraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        173,
+        175
+      ],
+      "page_stats": [
+        {
+          "page": 173,
+          "source": "ocr",
+          "chars": 1761,
+          "seconds": 1.73
+        },
+        {
+          "page": 174,
+          "source": "ocr",
+          "chars": 1759,
+          "seconds": 1.37
+        },
+        {
+          "page": 175,
+          "source": "ocr",
+          "chars": 2295,
+          "seconds": 1.58
+        }
+      ],
+      "total_chars": 5858,
+      "seconds": 4.68
+    },
+    {
+      "section": "Research Guidance",
+      "subsection": "Plagiarism",
+      "scope_label": "postgraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        176,
+        176
+      ],
+      "page_stats": [
+        {
+          "page": 176,
+          "source": "ocr",
+          "chars": 620,
+          "seconds": 0.87
+        }
+      ],
+      "total_chars": 633,
+      "seconds": 0.87
+    },
+    {
+      "section": "Research Guidance",
+      "subsection": "Intellectual Property",
+      "scope_label": "postgraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        177,
+        177
+      ],
+      "page_stats": [
+        {
+          "page": 177,
+          "source": "ocr",
+          "chars": 587,
+          "seconds": 0.71
+        }
+      ],
+      "total_chars": 600,
+      "seconds": 0.71
+    },
+    {
+      "section": "Research Guidance",
+      "subsection": "Postgraduate Activities",
+      "scope_label": "postgraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        178,
+        181
+      ],
+      "page_stats": [
+        {
+          "page": 178,
+          "source": "native_filtered",
+          "chars": 0,
+          "seconds": 0.85
+        },
+        {
+          "page": 179,
+          "source": "ocr_filtered",
+          "chars": 0,
+          "seconds": 0.53
+        },
+        {
+          "page": 180,
+          "source": "ocr",
+          "chars": 95,
+          "seconds": 0.57
+        },
+        {
+          "page": 181,
+          "source": "ocr_filtered",
+          "chars": 0,
+          "seconds": 0.61
+        }
+      ],
+      "total_chars": 108,
+      "seconds": 2.56
+    },
+    {
+      "section": "Laboratory Regulations and Support",
+      "subsection": "Laboratory Regulations",
+      "scope_label": "general",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        183,
+        183
+      ],
+      "page_stats": [
+        {
+          "page": 183,
+          "source": "ocr",
+          "chars": 1950,
+          "seconds": 1.42
+        }
+      ],
+      "total_chars": 1963,
+      "seconds": 1.42
+    },
+    {
+      "section": "Laboratory Regulations and Support",
+      "subsection": "Technical Problem Enquiries",
+      "scope_label": "general",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        184,
+        184
+      ],
+      "page_stats": [
+        {
+          "page": 184,
+          "source": "ocr",
+          "chars": 791,
+          "seconds": 0.83
+        }
+      ],
+      "total_chars": 804,
+      "seconds": 0.83
+    },
+    {
+      "section": "Undergraduate Faculty Identity",
+      "subsection": "Vision and Mission",
+      "scope_label": "undergraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        187,
+        187
+      ],
+      "page_stats": [
+        {
+          "page": 187,
+          "source": "native",
+          "chars": 590,
+          "seconds": 0.01
+        }
+      ],
+      "total_chars": 603,
+      "seconds": 0.01
+    },
+    {
+      "section": "Faculty Staff",
+      "subsection": "Undergraduate Dean's Office and Department Leadership",
+      "scope_label": "general",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        192,
+        199
+      ],
+      "page_stats": [
+        {
+          "page": 192,
+          "source": "native",
+          "chars": 1223,
+          "seconds": 0.01
+        },
+        {
+          "page": 193,
+          "source": "native",
+          "chars": 1100,
+          "seconds": 0.01
+        },
+        {
+          "page": 194,
+          "source": "native",
+          "chars": 403,
+          "seconds": 0.0
+        },
+        {
+          "page": 195,
+          "source": "native",
+          "chars": 1118,
+          "seconds": 0.0
+        },
+        {
+          "page": 196,
+          "source": "native",
+          "chars": 1263,
+          "seconds": 0.01
+        },
+        {
+          "page": 197,
+          "source": "native",
+          "chars": 1493,
+          "seconds": 0.01
+        },
+        {
+          "page": 198,
+          "source": "native",
+          "chars": 799,
+          "seconds": 0.0
+        },
+        {
+          "page": 199,
+          "source": "native",
+          "chars": 953,
+          "seconds": 0.01
+        }
+      ],
+      "total_chars": 8470,
+      "seconds": 0.04
+    },
+    {
+      "section": "Undergraduate Programmes",
+      "subsection": "Programmes Offered",
+      "scope_label": "undergraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        200,
+        200
+      ],
+      "page_stats": [
+        {
+          "page": 200,
+          "source": "native",
+          "chars": 419,
+          "seconds": 0.0
+        }
+      ],
+      "total_chars": 432,
+      "seconds": 0.0
+    },
+    {
+      "section": "Shared Undergraduate Curriculum",
+      "subsection": "University Courses",
+      "scope_label": "undergraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        225,
+        227
+      ],
+      "page_stats": [
+        {
+          "page": 225,
+          "source": "ocr_filtered",
+          "chars": 0,
+          "seconds": 0.3
+        },
+        {
+          "page": 226,
+          "source": "ocr",
+          "chars": 3602,
+          "seconds": 2.23
+        },
+        {
+          "page": 227,
+          "source": "ocr",
+          "chars": 383,
+          "seconds": 0.47
+        }
+      ],
+      "total_chars": 4013,
+      "seconds": 3.0
+    },
+    {
+      "section": "Shared Undergraduate Curriculum",
+      "subsection": "Faculty Core Courses",
+      "scope_label": "undergraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        228,
+        230
+      ],
+      "page_stats": [
+        {
+          "page": 228,
+          "source": "ocr_filtered",
+          "chars": 0,
+          "seconds": 0.33
+        },
+        {
+          "page": 229,
+          "source": "ocr",
+          "chars": 3495,
+          "seconds": 2.26
+        },
+        {
+          "page": 230,
+          "source": "ocr",
+          "chars": 940,
+          "seconds": 0.74
+        }
+      ],
+      "total_chars": 4463,
+      "seconds": 3.34
+    },
+    {
+      "section": "Shared Undergraduate Curriculum",
+      "subsection": "Programme Core Courses",
+      "scope_label": "undergraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        231,
+        239
+      ],
+      "page_stats": [
+        {
+          "page": 231,
+          "source": "native",
+          "chars": 354,
+          "seconds": 0.0
+        },
+        {
+          "page": 232,
+          "source": "ocr",
+          "chars": 3401,
+          "seconds": 2.1
+        },
+        {
+          "page": 233,
+          "source": "ocr",
+          "chars": 3656,
+          "seconds": 3.14
+        },
+        {
+          "page": 234,
+          "source": "ocr",
+          "chars": 3204,
+          "seconds": 2.7
+        },
+        {
+          "page": 235,
+          "source": "ocr",
+          "chars": 3661,
+          "seconds": 2.85
+        },
+        {
+          "page": 236,
+          "source": "ocr",
+          "chars": 3604,
+          "seconds": 2.67
+        },
+        {
+          "page": 237,
+          "source": "ocr",
+          "chars": 3105,
+          "seconds": 2.63
+        },
+        {
+          "page": 238,
+          "source": "ocr",
+          "chars": 3197,
+          "seconds": 2.38
+        },
+        {
+          "page": 239,
+          "source": "ocr",
+          "chars": 2244,
+          "seconds": 2.0
+        }
+      ],
+      "total_chars": 26559,
+      "seconds": 20.48
+    },
+    {
+      "section": "Shared Undergraduate Curriculum",
+      "subsection": "Specialization Elective Courses - Computer System and Network",
+      "scope_label": "undergraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        240,
+        244
+      ],
+      "page_stats": [
+        {
+          "page": 240,
+          "source": "ocr",
+          "chars": 87,
+          "seconds": 0.56
+        },
+        {
+          "page": 241,
+          "source": "ocr",
+          "chars": 3637,
+          "seconds": 2.72
+        },
+        {
+          "page": 242,
+          "source": "ocr",
+          "chars": 3475,
+          "seconds": 2.42
+        },
+        {
+          "page": 243,
+          "source": "ocr",
+          "chars": 2926,
+          "seconds": 2.62
+        },
+        {
+          "page": 244,
+          "source": "ocr",
+          "chars": 2290,
+          "seconds": 1.72
+        }
+      ],
+      "total_chars": 12488,
+      "seconds": 10.04
+    },
+    {
+      "section": "Shared Undergraduate Curriculum",
+      "subsection": "Specialization Elective Courses - Artificial Intelligence",
+      "scope_label": "undergraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        245,
+        249
+      ],
+      "page_stats": [
+        {
+          "page": 245,
+          "source": "ocr",
+          "chars": 86,
+          "seconds": 0.53
+        },
+        {
+          "page": 246,
+          "source": "ocr",
+          "chars": 3239,
+          "seconds": 2.28
+        },
+        {
+          "page": 247,
+          "source": "ocr",
+          "chars": 3267,
+          "seconds": 2.35
+        },
+        {
+          "page": 248,
+          "source": "ocr",
+          "chars": 3545,
+          "seconds": 2.23
+        },
+        {
+          "page": 249,
+          "source": "ocr",
+          "chars": 2485,
+          "seconds": 2.34
+        }
+      ],
+      "total_chars": 12695,
+      "seconds": 9.73
+    },
+    {
+      "section": "Shared Undergraduate Curriculum",
+      "subsection": "Specialization Elective Courses - Information Systems",
+      "scope_label": "undergraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        250,
+        254
+      ],
+      "page_stats": [
+        {
+          "page": 250,
+          "source": "ocr",
+          "chars": 78,
+          "seconds": 0.62
+        },
+        {
+          "page": 251,
+          "source": "ocr",
+          "chars": 3833,
+          "seconds": 2.52
+        },
+        {
+          "page": 252,
+          "source": "ocr",
+          "chars": 3880,
+          "seconds": 2.87
+        },
+        {
+          "page": 253,
+          "source": "ocr",
+          "chars": 3654,
+          "seconds": 2.41
+        },
+        {
+          "page": 254,
+          "source": "ocr",
+          "chars": 1627,
+          "seconds": 1.21
+        }
+      ],
+      "total_chars": 13145,
+      "seconds": 9.63
+    },
+    {
+      "section": "Shared Undergraduate Curriculum",
+      "subsection": "Specialization Elective Courses - Software Engineering",
+      "scope_label": "undergraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        255,
+        259
+      ],
+      "page_stats": [
+        {
+          "page": 255,
+          "source": "ocr",
+          "chars": 76,
+          "seconds": 0.44
+        },
+        {
+          "page": 256,
+          "source": "ocr",
+          "chars": 3329,
+          "seconds": 3.96
+        },
+        {
+          "page": 257,
+          "source": "ocr",
+          "chars": 3533,
+          "seconds": 3.88
+        },
+        {
+          "page": 258,
+          "source": "ocr",
+          "chars": 3705,
+          "seconds": 3.73
+        },
+        {
+          "page": 259,
+          "source": "ocr",
+          "chars": 3259,
+          "seconds": 2.37
+        }
+      ],
+      "total_chars": 13975,
+      "seconds": 14.38
+    },
+    {
+      "section": "Shared Undergraduate Curriculum",
+      "subsection": "Specialization Elective Courses - Multimedia Computing",
+      "scope_label": "undergraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        260,
+        264
+      ],
+      "page_stats": [
+        {
+          "page": 260,
+          "source": "ocr",
+          "chars": 84,
+          "seconds": 0.5
+        },
+        {
+          "page": 261,
+          "source": "ocr",
+          "chars": 3062,
+          "seconds": 1.9
+        },
+        {
+          "page": 262,
+          "source": "ocr",
+          "chars": 3265,
+          "seconds": 2.59
+        },
+        {
+          "page": 263,
+          "source": "ocr",
+          "chars": 3556,
+          "seconds": 2.82
+        },
+        {
+          "page": 264,
+          "source": "ocr",
+          "chars": 2773,
+          "seconds": 2.47
+        }
+      ],
+      "total_chars": 12813,
+      "seconds": 10.28
+    },
+    {
+      "section": "Shared Undergraduate Curriculum",
+      "subsection": "Specialization Elective Courses - Data Science",
+      "scope_label": "undergraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        265,
+        268
+      ],
+      "page_stats": [
+        {
+          "page": 265,
+          "source": "ocr",
+          "chars": 70,
+          "seconds": 0.52
+        },
+        {
+          "page": 266,
+          "source": "ocr",
+          "chars": 3197,
+          "seconds": 2.39
+        },
+        {
+          "page": 267,
+          "source": "ocr",
+          "chars": 3047,
+          "seconds": 3.97
+        },
+        {
+          "page": 268,
+          "source": "ocr",
+          "chars": 1485,
+          "seconds": 2.52
+        }
+      ],
+      "total_chars": 7857,
+      "seconds": 9.4
+    },
+    {
+      "section": "Industrial Training",
+      "subsection": "Industrial Training Guidelines",
+      "scope_label": "undergraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        270,
+        280
+      ],
+      "page_stats": [
+        {
+          "page": 270,
+          "source": "native",
+          "chars": 2514,
+          "seconds": 0.01
+        },
+        {
+          "page": 271,
+          "source": "native",
+          "chars": 2402,
+          "seconds": 0.01
+        },
+        {
+          "page": 272,
+          "source": "native",
+          "chars": 2948,
+          "seconds": 0.01
+        },
+        {
+          "page": 273,
+          "source": "native",
+          "chars": 3534,
+          "seconds": 0.01
+        },
+        {
+          "page": 274,
+          "source": "native",
+          "chars": 703,
+          "seconds": 0.0
+        },
+        {
+          "page": 275,
+          "source": "ocr",
+          "chars": 776,
+          "seconds": 0.97
+        },
+        {
+          "page": 276,
+          "source": "native",
+          "chars": 178,
+          "seconds": 0.01
+        },
+        {
+          "page": 277,
+          "source": "native",
+          "chars": 2139,
+          "seconds": 0.01
+        },
+        {
+          "page": 278,
+          "source": "native",
+          "chars": 2934,
+          "seconds": 0.01
+        },
+        {
+          "page": 279,
+          "source": "native",
+          "chars": 1523,
+          "seconds": 0.01
+        },
+        {
+          "page": 280,
+          "source": "native",
+          "chars": 1754,
+          "seconds": 0.01
+        }
+      ],
+      "total_chars": 21568,
+      "seconds": 1.07
+    },
+    {
+      "section": "Academic Project",
+      "subsection": "Academic Project I and II Guidelines",
+      "scope_label": "undergraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        282,
+        289
+      ],
+      "page_stats": [
+        {
+          "page": 282,
+          "source": "native",
+          "chars": 1706,
+          "seconds": 0.01
+        },
+        {
+          "page": 283,
+          "source": "native",
+          "chars": 1708,
+          "seconds": 0.01
+        },
+        {
+          "page": 284,
+          "source": "native",
+          "chars": 1461,
+          "seconds": 0.01
+        },
+        {
+          "page": 285,
+          "source": "native",
+          "chars": 548,
+          "seconds": 0.01
+        },
+        {
+          "page": 286,
+          "source": "native",
+          "chars": 1732,
+          "seconds": 0.01
+        },
+        {
+          "page": 287,
+          "source": "native",
+          "chars": 1579,
+          "seconds": 0.01
+        },
+        {
+          "page": 288,
+          "source": "native",
+          "chars": 1302,
+          "seconds": 0.01
+        },
+        {
+          "page": 289,
+          "source": "native",
+          "chars": 672,
+          "seconds": 0.01
+        }
+      ],
+      "total_chars": 10826,
+      "seconds": 0.09
+    },
+    {
+      "section": "Language Path and English Communication",
+      "subsection": "Language Path Course / English Communication Programme 2025/2026",
+      "scope_label": "undergraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        292,
+        296
+      ],
+      "page_stats": [
+        {
+          "page": 292,
+          "source": "native_filtered",
+          "chars": 0,
+          "seconds": 0.77
+        },
+        {
+          "page": 293,
+          "source": "ocr",
+          "chars": 2721,
+          "seconds": 3.34
+        },
+        {
+          "page": 294,
+          "source": "ocr",
+          "chars": 1698,
+          "seconds": 2.07
+        },
+        {
+          "page": 295,
+          "source": "ocr",
+          "chars": 2450,
+          "seconds": 2.3
+        },
+        {
+          "page": 296,
+          "source": "ocr",
+          "chars": 2045,
+          "seconds": 1.94
+        }
+      ],
+      "total_chars": 8972,
+      "seconds": 10.42
+    },
+    {
+      "section": "Student Dress Code",
+      "subsection": "Dress Code and Appearance Guides for Universiti Malaya Students",
+      "scope_label": "general",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        297,
+        298
+      ],
+      "page_stats": [
+        {
+          "page": "297-298",
+          "source": "manual_visual_override",
+          "chars": 1202,
+          "seconds": 0.0
+        }
+      ],
+      "total_chars": 1202,
+      "seconds": 0.0
+    },
+    {
+      "section": "Undergraduate Rules and Regulations",
+      "subsection": "Examination Honesty and Discipline / Undergraduate Rules",
+      "scope_label": "undergraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        299,
+        300
+      ],
+      "page_stats": [
+        {
+          "page": 299,
+          "source": "native_filtered",
+          "chars": 0,
+          "seconds": 0.57
+        },
+        {
+          "page": 300,
+          "source": "native",
+          "chars": 838,
+          "seconds": 0.01
+        }
+      ],
+      "total_chars": 851,
+      "seconds": 0.58
+    },
+    {
+      "section": "Examination Grading Scheme",
+      "subsection": "Official University Grades",
+      "scope_label": "undergraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        301,
+        301
+      ],
+      "page_stats": [
+        {
+          "page": 301,
+          "source": "ocr",
+          "chars": 481,
+          "seconds": 0.76
+        }
+      ],
+      "total_chars": 494,
+      "seconds": 0.76
+    },
+    {
+      "section": "Undergraduate Programme Goals and Learning Outcomes",
+      "subsection": "Bachelor of Computer Science (Computer System and Network)",
+      "scope_label": "undergraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        202,
+        204
+      ],
+      "page_stats": [
+        {
+          "page": 202,
+          "source": "native",
+          "chars": 1916,
+          "seconds": 0.01
+        },
+        {
+          "page": 203,
+          "source": "ocr",
+          "chars": 974,
+          "seconds": 1.18
+        },
+        {
+          "page": 204,
+          "source": "ocr",
+          "chars": 2214,
+          "seconds": 2.39
+        }
+      ],
+      "total_chars": 5147,
+      "seconds": 3.58
+    },
+    {
+      "section": "Undergraduate Programme Goals and Learning Outcomes",
+      "subsection": "Bachelor of Computer Science (Artificial Intelligence)",
+      "scope_label": "undergraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        206,
+        208
+      ],
+      "page_stats": [
+        {
+          "page": 206,
+          "source": "native",
+          "chars": 2037,
+          "seconds": 0.01
+        },
+        {
+          "page": 207,
+          "source": "ocr",
+          "chars": 2403,
+          "seconds": 1.72
+        },
+        {
+          "page": 208,
+          "source": "ocr",
+          "chars": 2362,
+          "seconds": 2.05
+        }
+      ],
+      "total_chars": 6845,
+      "seconds": 3.78
+    },
+    {
+      "section": "Undergraduate Programme Goals and Learning Outcomes",
+      "subsection": "Bachelor of Computer Science (Information Systems)",
+      "scope_label": "undergraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        210,
+        212
+      ],
+      "page_stats": [
+        {
+          "page": 210,
+          "source": "native",
+          "chars": 2018,
+          "seconds": 0.0
+        },
+        {
+          "page": 211,
+          "source": "ocr",
+          "chars": 2743,
+          "seconds": 1.97
+        },
+        {
+          "page": 212,
+          "source": "ocr",
+          "chars": 2339,
+          "seconds": 1.62
+        }
+      ],
+      "total_chars": 7143,
+      "seconds": 3.59
+    },
+    {
+      "section": "Undergraduate Programme Goals and Learning Outcomes",
+      "subsection": "Bachelor of Computer Science (Software Engineering)",
+      "scope_label": "undergraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        214,
+        216
+      ],
+      "page_stats": [
+        {
+          "page": 214,
+          "source": "native",
+          "chars": 1948,
+          "seconds": 0.01
+        },
+        {
+          "page": 215,
+          "source": "ocr",
+          "chars": 2867,
+          "seconds": 1.93
+        },
+        {
+          "page": 216,
+          "source": "ocr",
+          "chars": 2170,
+          "seconds": 1.92
+        }
+      ],
+      "total_chars": 7028,
+      "seconds": 3.86
+    },
+    {
+      "section": "Undergraduate Programme Goals and Learning Outcomes",
+      "subsection": "Bachelor of Computer Science (Multimedia Computing)",
+      "scope_label": "undergraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        218,
+        220
+      ],
+      "page_stats": [
+        {
+          "page": 218,
+          "source": "native",
+          "chars": 2009,
+          "seconds": 0.01
+        },
+        {
+          "page": 219,
+          "source": "ocr",
+          "chars": 2673,
+          "seconds": 2.12
+        },
+        {
+          "page": 220,
+          "source": "ocr",
+          "chars": 2398,
+          "seconds": 2.0
+        }
+      ],
+      "total_chars": 7123,
+      "seconds": 4.13
+    },
+    {
+      "section": "Undergraduate Programme Goals and Learning Outcomes",
+      "subsection": "Bachelor of Computer Science (Data Science)",
+      "scope_label": "undergraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        222,
+        224
+      ],
+      "page_stats": [
+        {
+          "page": 222,
+          "source": "native",
+          "chars": 1685,
+          "seconds": 0.01
+        },
+        {
+          "page": 223,
+          "source": "ocr",
+          "chars": 1842,
+          "seconds": 1.61
+        },
+        {
+          "page": 224,
+          "source": "ocr",
+          "chars": 1979,
+          "seconds": 1.7
+        }
+      ],
+      "total_chars": 5549,
+      "seconds": 3.32
+    },
+    {
+      "section": "Master of Computer Science (Applied Computing)",
+      "subsection": "Programme Requirements",
+      "scope_label": "postgraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        37,
+        38
+      ],
+      "page_stats": [
+        {
+          "page": 37,
+          "source": "ocr",
+          "chars": 2541,
+          "seconds": 1.54
+        },
+        {
+          "page": 38,
+          "source": "ocr",
+          "chars": 1535,
+          "seconds": 1.09
+        }
+      ],
+      "total_chars": 4102,
+      "seconds": 2.64
+    },
+    {
+      "section": "Master of Computer Science (Applied Computing)",
+      "subsection": "Programme Objectives and Outcomes",
+      "scope_label": "postgraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        39,
+        40
+      ],
+      "page_stats": [
+        {
+          "page": 39,
+          "source": "ocr",
+          "chars": 2021,
+          "seconds": 1.55
+        },
+        {
+          "page": 40,
+          "source": "ocr",
+          "chars": 119,
+          "seconds": 0.42
+        }
+      ],
+      "total_chars": 2166,
+      "seconds": 1.97
+    },
+    {
+      "section": "Master of Computer Science (Applied Computing)",
+      "subsection": "Candidature Requirements",
+      "scope_label": "postgraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        41,
+        41
+      ],
+      "page_stats": [
+        {
+          "page": 41,
+          "source": "ocr",
+          "chars": 449,
+          "seconds": 0.59
+        }
+      ],
+      "total_chars": 461,
+      "seconds": 0.59
+    },
+    {
+      "section": "Master of Computer Science (Applied Computing)",
+      "subsection": "Graduate on Time (GOT) Schedule",
+      "scope_label": "postgraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        42,
+        42
+      ],
+      "page_stats": [
+        {
+          "page": 42,
+          "source": "ocr",
+          "chars": 2485,
+          "seconds": 1.45
+        }
+      ],
+      "total_chars": 2497,
+      "seconds": 1.45
+    },
+    {
+      "section": "Master of Computer Science (Applied Computing)",
+      "subsection": "Course Plan",
+      "scope_label": "postgraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        43,
+        44
+      ],
+      "page_stats": [
+        {
+          "page": 43,
+          "source": "ocr",
+          "chars": 1060,
+          "seconds": 0.96
+        },
+        {
+          "page": 44,
+          "source": "ocr",
+          "chars": 1001,
+          "seconds": 0.94
+        }
+      ],
+      "total_chars": 2087,
+      "seconds": 1.9
+    },
+    {
+      "section": "Master of Computer Science (Applied Computing)",
+      "subsection": "List of Courses and Contents",
+      "scope_label": "postgraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        45,
+        50
+      ],
+      "page_stats": [
+        {
+          "page": 45,
+          "source": "ocr",
+          "chars": 934,
+          "seconds": 0.71
+        },
+        {
+          "page": 46,
+          "source": "ocr",
+          "chars": 2137,
+          "seconds": 1.73
+        },
+        {
+          "page": 47,
+          "source": "ocr",
+          "chars": 2038,
+          "seconds": 1.52
+        },
+        {
+          "page": 48,
+          "source": "ocr",
+          "chars": 2322,
+          "seconds": 1.73
+        },
+        {
+          "page": 49,
+          "source": "ocr",
+          "chars": 2567,
+          "seconds": 1.7
+        },
+        {
+          "page": 50,
+          "source": "ocr",
+          "chars": 758,
+          "seconds": 0.67
+        }
+      ],
+      "total_chars": 10838,
+      "seconds": 8.06
+    },
+    {
+      "section": "Master of Software Engineering (Software Technology)",
+      "subsection": "Programme Requirements",
+      "scope_label": "postgraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        52,
+        53
+      ],
+      "page_stats": [
+        {
+          "page": 52,
+          "source": "ocr",
+          "chars": 2365,
+          "seconds": 1.45
+        },
+        {
+          "page": 53,
+          "source": "ocr",
+          "chars": 1714,
+          "seconds": 1.18
+        }
+      ],
+      "total_chars": 4105,
+      "seconds": 2.63
+    },
+    {
+      "section": "Master of Software Engineering (Software Technology)",
+      "subsection": "Programme Objectives and Outcomes",
+      "scope_label": "postgraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        54,
+        55
+      ],
+      "page_stats": [
+        {
+          "page": 54,
+          "source": "ocr",
+          "chars": 1842,
+          "seconds": 1.78
+        },
+        {
+          "page": 55,
+          "source": "ocr",
+          "chars": 434,
+          "seconds": 0.53
+        }
+      ],
+      "total_chars": 2302,
+      "seconds": 2.31
+    },
+    {
+      "section": "Master of Software Engineering (Software Technology)",
+      "subsection": "Candidature Requirements",
+      "scope_label": "postgraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        56,
+        56
+      ],
+      "page_stats": [
+        {
+          "page": 56,
+          "source": "ocr",
+          "chars": 528,
+          "seconds": 0.47
+        }
+      ],
+      "total_chars": 540,
+      "seconds": 0.47
+    },
+    {
+      "section": "Master of Software Engineering (Software Technology)",
+      "subsection": "Graduate on Time (GOT) Schedule",
+      "scope_label": "postgraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        57,
+        57
+      ],
+      "page_stats": [
+        {
+          "page": 57,
+          "source": "ocr",
+          "chars": 1834,
+          "seconds": 1.49
+        }
+      ],
+      "total_chars": 1846,
+      "seconds": 1.49
+    },
+    {
+      "section": "Master of Software Engineering (Software Technology)",
+      "subsection": "Course Plan",
+      "scope_label": "postgraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        59,
+        61
+      ],
+      "page_stats": [
+        {
+          "page": 59,
+          "source": "ocr",
+          "chars": 1178,
+          "seconds": 0.95
+        },
+        {
+          "page": 60,
+          "source": "ocr",
+          "chars": 1207,
+          "seconds": 1.19
+        },
+        {
+          "page": 61,
+          "source": "ocr",
+          "chars": 2744,
+          "seconds": 1.8
+        }
+      ],
+      "total_chars": 5169,
+      "seconds": 3.94
+    },
+    {
+      "section": "Master of Software Engineering (Software Technology)",
+      "subsection": "List of Courses and Contents",
+      "scope_label": "postgraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        62,
+        68
+      ],
+      "page_stats": [
+        {
+          "page": 62,
+          "source": "ocr",
+          "chars": 983,
+          "seconds": 0.8
+        },
+        {
+          "page": 63,
+          "source": "ocr",
+          "chars": 2055,
+          "seconds": 1.61
+        },
+        {
+          "page": 64,
+          "source": "ocr",
+          "chars": 1893,
+          "seconds": 1.35
+        },
+        {
+          "page": 65,
+          "source": "ocr",
+          "chars": 2481,
+          "seconds": 2.0
+        },
+        {
+          "page": 66,
+          "source": "ocr",
+          "chars": 1943,
+          "seconds": 1.63
+        },
+        {
+          "page": 67,
+          "source": "ocr",
+          "chars": 2598,
+          "seconds": 1.9
+        },
+        {
+          "page": 68,
+          "source": "ocr",
+          "chars": 1635,
+          "seconds": 1.33
+        }
+      ],
+      "total_chars": 13684,
+      "seconds": 10.63
+    },
+    {
+      "section": "Master in Data Science",
+      "subsection": "Programme Requirements",
+      "scope_label": "postgraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        70,
+        71
+      ],
+      "page_stats": [
+        {
+          "page": 70,
+          "source": "ocr",
+          "chars": 1886,
+          "seconds": 1.36
+        },
+        {
+          "page": 71,
+          "source": "ocr",
+          "chars": 670,
+          "seconds": 1.56
+        }
+      ],
+      "total_chars": 2582,
+      "seconds": 2.92
+    },
+    {
+      "section": "Master in Data Science",
+      "subsection": "Programme Objectives and Outcomes",
+      "scope_label": "postgraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        72,
+        74
+      ],
+      "page_stats": [
+        {
+          "page": 72,
+          "source": "ocr",
+          "chars": 2078,
+          "seconds": 1.49
+        },
+        {
+          "page": 73,
+          "source": "ocr",
+          "chars": 117,
+          "seconds": 0.35
+        },
+        {
+          "page": 74,
+          "source": "ocr",
+          "chars": 455,
+          "seconds": 0.54
+        }
+      ],
+      "total_chars": 2690,
+      "seconds": 2.38
+    },
+    {
+      "section": "Master in Data Science",
+      "subsection": "Course Plan",
+      "scope_label": "postgraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        75,
+        76
+      ],
+      "page_stats": [
+        {
+          "page": 75,
+          "source": "ocr",
+          "chars": 1310,
+          "seconds": 1.14
+        },
+        {
+          "page": 76,
+          "source": "ocr",
+          "chars": 1222,
+          "seconds": 0.97
+        }
+      ],
+      "total_chars": 2558,
+      "seconds": 2.12
+    },
+    {
+      "section": "Master in Data Science",
+      "subsection": "List of Courses and Contents",
+      "scope_label": "postgraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        77,
+        82
+      ],
+      "page_stats": [
+        {
+          "page": 77,
+          "source": "ocr",
+          "chars": 985,
+          "seconds": 0.8
+        },
+        {
+          "page": 78,
+          "source": "ocr",
+          "chars": 2765,
+          "seconds": 1.59
+        },
+        {
+          "page": 79,
+          "source": "ocr",
+          "chars": 2612,
+          "seconds": 1.51
+        },
+        {
+          "page": 80,
+          "source": "ocr",
+          "chars": 2480,
+          "seconds": 1.53
+        },
+        {
+          "page": 81,
+          "source": "ocr",
+          "chars": 2256,
+          "seconds": 1.43
+        },
+        {
+          "page": 82,
+          "source": "ocr",
+          "chars": 931,
+          "seconds": 0.71
+        }
+      ],
+      "total_chars": 12111,
+      "seconds": 7.58
+    },
+    {
+      "section": "Master of Cyber Security",
+      "subsection": "Programme Requirements",
+      "scope_label": "postgraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        84,
+        86
+      ],
+      "page_stats": [
+        {
+          "page": 84,
+          "source": "ocr",
+          "chars": 1084,
+          "seconds": 0.78
+        },
+        {
+          "page": 85,
+          "source": "ocr",
+          "chars": 2367,
+          "seconds": 1.28
+        },
+        {
+          "page": 86,
+          "source": "ocr",
+          "chars": 476,
+          "seconds": 0.52
+        }
+      ],
+      "total_chars": 3967,
+      "seconds": 2.58
+    },
+    {
+      "section": "Master of Cyber Security",
+      "subsection": "Programme Objectives and Outcomes",
+      "scope_label": "postgraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        87,
+        88
+      ],
+      "page_stats": [
+        {
+          "page": 87,
+          "source": "ocr",
+          "chars": 1926,
+          "seconds": 1.22
+        },
+        {
+          "page": 88,
+          "source": "ocr",
+          "chars": 713,
+          "seconds": 0.68
+        }
+      ],
+      "total_chars": 2665,
+      "seconds": 1.91
+    },
+    {
+      "section": "Master of Cyber Security",
+      "subsection": "Course Plan",
+      "scope_label": "postgraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        89,
+        90
+      ],
+      "page_stats": [
+        {
+          "page": 89,
+          "source": "ocr",
+          "chars": 1247,
+          "seconds": 1.04
+        },
+        {
+          "page": 90,
+          "source": "ocr",
+          "chars": 1260,
+          "seconds": 0.95
+        }
+      ],
+      "total_chars": 2533,
+      "seconds": 1.99
+    },
+    {
+      "section": "Master of Cyber Security",
+      "subsection": "List of Courses and Contents",
+      "scope_label": "postgraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        91,
+        97
+      ],
+      "page_stats": [
+        {
+          "page": 91,
+          "source": "ocr",
+          "chars": 1052,
+          "seconds": 0.79
+        },
+        {
+          "page": 92,
+          "source": "ocr",
+          "chars": 2352,
+          "seconds": 1.46
+        },
+        {
+          "page": 93,
+          "source": "ocr",
+          "chars": 2367,
+          "seconds": 1.34
+        },
+        {
+          "page": 94,
+          "source": "ocr",
+          "chars": 2098,
+          "seconds": 1.53
+        },
+        {
+          "page": 95,
+          "source": "ocr",
+          "chars": 2644,
+          "seconds": 2.9
+        },
+        {
+          "page": 96,
+          "source": "ocr",
+          "chars": 2141,
+          "seconds": 2.39
+        },
+        {
+          "page": 97,
+          "source": "ocr",
+          "chars": 716,
+          "seconds": 1.18
+        }
+      ],
+      "total_chars": 13466,
+      "seconds": 11.6
+    },
+    {
+      "section": "Master of Artificial Intelligence",
+      "subsection": "Programme Requirements",
+      "scope_label": "postgraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        99,
+        100
+      ],
+      "page_stats": [
+        {
+          "page": 99,
+          "source": "ocr",
+          "chars": 1426,
+          "seconds": 2.16
+        },
+        {
+          "page": 100,
+          "source": "ocr",
+          "chars": 983,
+          "seconds": 1.14
+        }
+      ],
+      "total_chars": 2436,
+      "seconds": 3.3
+    },
+    {
+      "section": "Master of Artificial Intelligence",
+      "subsection": "Programme Objectives and Outcomes",
+      "scope_label": "postgraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        101,
+        102
+      ],
+      "page_stats": [
+        {
+          "page": 101,
+          "source": "ocr",
+          "chars": 1781,
+          "seconds": 2.87
+        },
+        {
+          "page": 102,
+          "source": "ocr",
+          "chars": 587,
+          "seconds": 0.73
+        }
+      ],
+      "total_chars": 2396,
+      "seconds": 3.6
+    },
+    {
+      "section": "Master of Artificial Intelligence",
+      "subsection": "Course Plan",
+      "scope_label": "postgraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        103,
+        103
+      ],
+      "page_stats": [
+        {
+          "page": 103,
+          "source": "ocr",
+          "chars": 1442,
+          "seconds": 1.2
+        }
+      ],
+      "total_chars": 1455,
+      "seconds": 1.2
+    },
+    {
+      "section": "Master of Artificial Intelligence",
+      "subsection": "List of Courses and Contents",
+      "scope_label": "postgraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        104,
+        111
+      ],
+      "page_stats": [
+        {
+          "page": 104,
+          "source": "ocr",
+          "chars": 1110,
+          "seconds": 0.94
+        },
+        {
+          "page": 105,
+          "source": "ocr",
+          "chars": 2359,
+          "seconds": 1.53
+        },
+        {
+          "page": 106,
+          "source": "ocr",
+          "chars": 2484,
+          "seconds": 1.5
+        },
+        {
+          "page": 107,
+          "source": "ocr",
+          "chars": 1911,
+          "seconds": 1.15
+        },
+        {
+          "page": 108,
+          "source": "ocr",
+          "chars": 2235,
+          "seconds": 1.31
+        },
+        {
+          "page": 109,
+          "source": "ocr",
+          "chars": 2413,
+          "seconds": 1.46
+        },
+        {
+          "page": 110,
+          "source": "ocr",
+          "chars": 1976,
+          "seconds": 1.56
+        },
+        {
+          "page": 111,
+          "source": "ocr",
+          "chars": 1211,
+          "seconds": 0.95
+        }
+      ],
+      "total_chars": 15817,
+      "seconds": 10.39
+    },
+    {
+      "section": "Master of Computer Science (By Research)",
+      "subsection": "Programme Requirements",
+      "scope_label": "postgraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        113,
+        113
+      ],
+      "page_stats": [
+        {
+          "page": 113,
+          "source": "ocr",
+          "chars": 1523,
+          "seconds": 1.33
+        }
+      ],
+      "total_chars": 1536,
+      "seconds": 1.33
+    },
+    {
+      "section": "Master of Computer Science (By Research)",
+      "subsection": "Learning Objectives and Outcomes",
+      "scope_label": "postgraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        114,
+        115
+      ],
+      "page_stats": [
+        {
+          "page": 114,
+          "source": "ocr",
+          "chars": 1233,
+          "seconds": 1.1
+        },
+        {
+          "page": 115,
+          "source": "ocr",
+          "chars": 500,
+          "seconds": 0.59
+        }
+      ],
+      "total_chars": 1761,
+      "seconds": 1.69
+    },
+    {
+      "section": "Master of Computer Science (By Research)",
+      "subsection": "Candidature Requirements",
+      "scope_label": "postgraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        116,
+        116
+      ],
+      "page_stats": [
+        {
+          "page": 116,
+          "source": "ocr",
+          "chars": 1206,
+          "seconds": 0.91
+        }
+      ],
+      "total_chars": 1219,
+      "seconds": 0.91
+    },
+    {
+      "section": "Master of Computer Science (By Research)",
+      "subsection": "Graduate on Time (GOT) Schedule",
+      "scope_label": "postgraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        117,
+        117
+      ],
+      "page_stats": [
+        {
+          "page": 117,
+          "source": "ocr",
+          "chars": 2103,
+          "seconds": 1.9
+        }
+      ],
+      "total_chars": 2116,
+      "seconds": 1.9
+    },
+    {
+      "section": "Master of Computer Science (By Research)",
+      "subsection": "Research Methodology / Course Contents",
+      "scope_label": "postgraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        118,
+        118
+      ],
+      "page_stats": [
+        {
+          "page": 118,
+          "source": "ocr",
+          "chars": 1765,
+          "seconds": 1.34
+        }
+      ],
+      "total_chars": 1778,
+      "seconds": 1.34
+    },
+    {
+      "section": "Doctor of Philosophy",
+      "subsection": "Advanced Research Methods Course Content",
+      "scope_label": "postgraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        120,
+        120
+      ],
+      "page_stats": [
+        {
+          "page": 120,
+          "source": "ocr",
+          "chars": 1210,
+          "seconds": 1.09
+        }
+      ],
+      "total_chars": 1223,
+      "seconds": 1.09
+    },
+    {
+      "section": "Doctor of Philosophy",
+      "subsection": "Programme Education Objectives",
+      "scope_label": "postgraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        121,
+        121
+      ],
+      "page_stats": [
+        {
+          "page": 121,
+          "source": "ocr",
+          "chars": 336,
+          "seconds": 0.6
+        }
+      ],
+      "total_chars": 349,
+      "seconds": 0.6
+    },
+    {
+      "section": "Doctor of Philosophy",
+      "subsection": "Learning Outcomes",
+      "scope_label": "postgraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        122,
+        122
+      ],
+      "page_stats": [
+        {
+          "page": 122,
+          "source": "ocr",
+          "chars": 588,
+          "seconds": 0.65
+        }
+      ],
+      "total_chars": 601,
+      "seconds": 0.65
+    },
+    {
+      "section": "Doctor of Philosophy",
+      "subsection": "Candidature Requirements",
+      "scope_label": "postgraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        123,
+        123
+      ],
+      "page_stats": [
+        {
+          "page": 123,
+          "source": "ocr",
+          "chars": 1535,
+          "seconds": 2.15
+        }
+      ],
+      "total_chars": 1548,
+      "seconds": 2.15
+    },
+    {
+      "section": "Doctor of Philosophy",
+      "subsection": "Proposed Graduate on Time (GOT) Schedule",
+      "scope_label": "postgraduate",
+      "source_doc": "Complete Handbook",
+      "pages": [
+        124,
+        124
+      ],
+      "page_stats": [
+        {
+          "page": 124,
+          "source": "ocr",
+          "chars": 2280,
+          "seconds": 1.77
+        }
+      ],
+      "total_chars": 2293,
+      "seconds": 1.77
+    }
+  ],
+  "total_seconds": 313.89
+}

UM_Handbook/FineTune_QWEN3_UM_Handbook_optimized_1.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

UM_Handbook/UM_Handbook_Markdown_Preprocess.py ADDED Viewed

	@@ -0,0 +1,286 @@

+from pathlib import Path
+import json
+import re
+import time
+from typing import List
+import fitz
+import pytesseract
+from PIL import Image
+from um_handbook_config import (
+    GENERAL_PDF,
+    COMPLETE_PDF,
+    GENERAL_BLOCKS,
+    COMPLETE_BLOCKS,
+    DATA_ROOT,
+    MARKDOWN_DIR,
+    REPORTS_DIR,
+)
+PROJECT_DIR = Path(__file__).resolve().parent
+DATA_ROOT.mkdir(exist_ok=True)
+MARKDOWN_DIR.mkdir(exist_ok=True)
+REPORTS_DIR.mkdir(exist_ok=True)
+GENERAL_MD = MARKDOWN_DIR / "general_handbook_structured.md"
+COMPLETE_MD = MARKDOWN_DIR / "complete_handbook_structured.md"
+REPORT_PATH = REPORTS_DIR / "um_handbook_markdown_report.json"
+BAD_PAGE_PATTERNS = [
+    r"\bmore info\b",
+    r"fsktm[_\.]?um",
+    r"POSTGRADUATE\s+PROGRAMME\s+HANDBOOK",
+    r"UNDERGRADUATE\s+PROGRAMME\s+HANDBOOK",
+    r"^C\s*O\s*N\s*T\s*E\s*N\s*T\s*S$",
+]
+def normalize_whitespace(text: str) -> str:
+    text = text.replace("\u00a0", " ").replace("\xad", "")
+    text = text.replace("ﬁ", "fi").replace("ﬂ", "fl")
+    text = re.sub(r"[ \t]+", " ", text)
+    text = re.sub(r"\n{3,}", "\n\n", text)
+    return text.strip()
+def clean_page_text(text: str) -> str:
+    lines = []
+    for raw in text.splitlines():
+        line = raw.strip()
+        if not line:
+            continue
+        if re.fullmatch(r"\d+", line):
+            continue
+        if len(line) == 1 and not line.isalnum():
+            continue
+        lines.append(line)
+    text = "\n".join(lines)
+    text = re.sub(r"(?m)^\s*[•▪●]\s*", "- ", text)
+    text = re.sub(r"(?m)^\s*([a-z])\)\s*", r"- ", text)
+    return normalize_whitespace(text)
+def ocr_page(page: fitz.Page, zoom: float = 1.5) -> str:
+    pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)
+    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+    return pytesseract.image_to_string(img)
+def looks_like_noise_page(text: str) -> bool:
+    t = normalize_whitespace(text)
+    if not t:
+        return True
+    compact = t.replace("\n", " ")
+    alpha_words = re.findall(r"[A-Za-z][A-Za-z&'/-]+", compact)
+    # real content pages normally have more than a handful of alphabetic words
+    if len(alpha_words) < 8:
+        return True
+    # low-information cover / contents / banner pages
+    for pattern in BAD_PAGE_PATTERNS:
+        if re.search(pattern, compact, flags=re.IGNORECASE):
+            if len(alpha_words) < 60:
+                return True
+    # very short all-caps dividers
+    if len(compact) < 120 and compact.upper() == compact and len(alpha_words) < 15:
+        return True
+    return False
+def extract_page_text(doc: fitz.Document, page_number_1_based: int) -> tuple[str, str]:
+    page = doc[page_number_1_based - 1]
+    native = clean_page_text(page.get_text("text"))
+    source = "native"
+    need_ocr = len(native) < 120 or looks_like_noise_page(native)
+    if need_ocr:
+        ocr_text = clean_page_text(ocr_page(page))
+        if len(ocr_text) > len(native):
+            native = ocr_text
+            source = "ocr"
+    if looks_like_noise_page(native):
+        return "", f"{source}_filtered"
+    return native, source
+def progress_bar(current: int, total: int, width: int = 28) -> str:
+    if total <= 0:
+        return "[no-progress]"
+    filled = int(width * current / total)
+    bar = "#" * filled + "-" * (width - filled)
+    pct = (current / total) * 100
+    return f"[{bar}] {current}/{total} ({pct:5.1f}%)"
+def block_to_markdown(doc: fitz.Document, block: dict, block_index: int, total_blocks: int, pdf_label: str) -> tuple[str, dict]:
+    start, end = block["pages"]
+    if block.get("manual_text"):
+        print()
+        print("=" * 90)
+        print(f"[BLOCK {block_index}/{total_blocks}] {pdf_label} | {block['section']} :: {block['subsection']} | pages {start}-{end} | MANUAL OVERRIDE")
+        print("=" * 90)
+        body = normalize_whitespace(block["manual_text"])
+        header = (
+            f"## {block['section']} :: {block['subsection']}\n\n"
+            f"- scope_label: {block['scope_label']}\n"
+            f"- source_doc: {block['source_doc']}\n"
+            f"- pages: {start}-{end}\n"
+        )
+        meta = {
+            "section": block["section"],
+            "subsection": block["subsection"],
+            "scope_label": block["scope_label"],
+            "source_doc": block["source_doc"],
+            "pages": [start, end],
+            "page_stats": [{"page": f"{start}-{end}", "source": "manual_visual_override", "chars": len(body), "seconds": 0.0}],
+            "total_chars": len(body),
+            "seconds": 0.0,
+        }
+        print(f"[DONE BLOCK] {block['section']} :: {block['subsection']} | MANUAL OVERRIDE")
+        return (header + ("\n" + body + "\n" if body else "\n"), meta)
+    pieces: List[str] = []
+    page_stats = []
+    block_start_time = time.time()
+    total_pages = end - start + 1
+    print()
+    print("=" * 90)
+    print(f"[BLOCK {block_index}/{total_blocks}] {pdf_label} | {block['section']} :: {block['subsection']} | pages {start}-{end}")
+    print("=" * 90)
+    for i, p in enumerate(range(start, end + 1), start=1):
+        page_start_time = time.time()
+        print(f"  {progress_bar(i, total_pages)} -> extracting page {p} ... ", end="", flush=True)
+        text, source = extract_page_text(doc, p)
+        elapsed = time.time() - page_start_time
+        if text:
+            pieces.append(f"### Page {p}\n{text}")
+        page_stats.append(
+            {
+                "page": p,
+                "source": source,
+                "chars": len(text),
+                "seconds": round(elapsed, 2),
+            }
+        )
+        print(f"{source.upper():12s} | chars={len(text):5d} | {elapsed:6.2f}s", flush=True)
+    block_elapsed = time.time() - block_start_time
+    body = "\n\n".join(pieces).strip()
+    header = (
+        f"## {block['section']} :: {block['subsection']}\n\n"
+        f"- scope_label: {block['scope_label']}\n"
+        f"- source_doc: {block['source_doc']}\n"
+        f"- pages: {start}-{end}\n"
+    )
+    meta = {
+        "section": block["section"],
+        "subsection": block["subsection"],
+        "scope_label": block["scope_label"],
+        "source_doc": block["source_doc"],
+        "pages": [start, end],
+        "page_stats": page_stats,
+        "total_chars": len(body),
+        "seconds": round(block_elapsed, 2),
+    }
+    print(f"[DONE BLOCK] {block['section']} :: {block['subsection']} | {block_elapsed:.2f}s")
+    return (header + ("\n" + body + "\n" if body else "\n"), meta)
+def write_markdown(pdf_path: Path, blocks: list[dict], out_path: Path, title: str) -> list[dict]:
+    print()
+    print("#" * 100)
+    print(f"[START] Building markdown for: {title}")
+    print(f"[PDF]   {pdf_path}")
+    print(f"[OUT]   {out_path}")
+    print(f"[BLOCKS] {len(blocks)}")
+    print("#" * 100)
+    sections = [f"# {title}", ""]
+    report_rows = []
+    start_time = time.time()
+    with fitz.open(pdf_path) as doc:
+        for idx, block in enumerate(blocks, start=1):
+            md, meta = block_to_markdown(
+                doc=doc,
+                block=block,
+                block_index=idx,
+                total_blocks=len(blocks),
+                pdf_label=pdf_path.name,
+            )
+            sections.append(md)
+            report_rows.append(meta)
+    out_path.write_text("\n\n".join(sections).strip() + "\n", encoding="utf-8")
+    elapsed = time.time() - start_time
+    print(f"[DONE FILE] {title} -> {out_path} | {elapsed:.2f}s")
+    return report_rows
+def main() -> None:
+    total_start = time.time()
+    print("[INFO] Markdown preprocess started")
+    print(f"[INFO] General PDF : {GENERAL_PDF}")
+    print(f"[INFO] Complete PDF: {COMPLETE_PDF}")
+    print(f"[INFO] General MD  : {GENERAL_MD}")
+    print(f"[INFO] Complete MD : {COMPLETE_MD}")
+    print(f"[INFO] Report Path : {REPORT_PATH}")
+    general_report = write_markdown(
+        GENERAL_PDF,
+        GENERAL_BLOCKS,
+        GENERAL_MD,
+        "General Handbook (Structured Markdown)",
+    )
+    complete_report = write_markdown(
+        COMPLETE_PDF,
+        COMPLETE_BLOCKS,
+        COMPLETE_MD,
+        "Complete Handbook (Structured Markdown)",
+    )
+    report = {
+        "generated_files": {
+            "general_markdown": str(GENERAL_MD),
+            "complete_markdown": str(COMPLETE_MD),
+        },
+        "general_block_count": len(general_report),
+        "complete_block_count": len(complete_report),
+        "general_blocks": general_report,
+        "complete_blocks": complete_report,
+        "total_seconds": round(time.time() - total_start, 2),
+    }
+    REPORT_PATH.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8")
+    print()
+    print("=" * 100)
+    print(f"Wrote: {GENERAL_MD}")
+    print(f"Wrote: {COMPLETE_MD}")
+    print(f"Wrote: {REPORT_PATH}")
+    print(f"[ALL DONE] Total time: {time.time() - total_start:.2f}s")
+    print("=" * 100)
+if __name__ == "__main__":
+    main()

UM_Handbook/UM_SFT_QA_Dataset_Builder_from_Index.py ADDED Viewed

	@@ -0,0 +1,620 @@

+from __future__ import annotations
+from pathlib import Path
+import json
+import re
+from typing import Any
+from um_handbook_config import DATA_ROOT, INDEX_DIR, CHUNKS_DIR, SFT_DIR
+PROJECT_DIR = Path(__file__).resolve().parent
+DATA_ROOT.mkdir(exist_ok=True)
+INDEX_DIR.mkdir(exist_ok=True)
+CHUNKS_DIR.mkdir(exist_ok=True)
+SFT_DIR.mkdir(exist_ok=True)
+INDEX_PATH = INDEX_DIR / "UM_Manual_Index.json"
+CHUNK_PATH = CHUNKS_DIR / "Source_Chunks_Dataset.jsonl"
+METADATA_PATH = SFT_DIR / "SFT_QA_Metadata_Draft.jsonl"
+METADATA_PRETTY_PATH = SFT_DIR / "SFT_QA_Metadata_Draft_pretty.json"
+TRAINING_READY_PATH = SFT_DIR / "SFT_QA_Training_Draft.jsonl"
+TRAINING_READY_PRETTY_PATH = SFT_DIR / "SFT_QA_Training_Draft_pretty.json"
+REPORT_PATH = SFT_DIR / "SFT_QA_Draft_Build_Report.json"
+MANUAL_QA_OVERRIDES = {
+    "UMI-0112": "The UM student dress code poster says that all Universiti Malaya students must follow the Universiti Malaya Administrative Directions (Student Dress Code and Appearance) 2024 while on campus. It illustrates three main attire contexts: formal or traditional formal attire for official events, neat and presentable campus attire for lectures, office matters, examinations, and library use, and sportswear for sports and recreational activities. The poster also states that non-compliance may lead to reprimand or other administrative action.",
+    "UMI-0379": "The poster emphasizes that students must comply with the Universiti Malaya Administrative Directions on dress code and appearance while on campus. The overall message is to dress in a neat, presentable, and context-appropriate way, with different attire illustrated for official events, normal academic or administrative settings, and sports or recreational activities.",
+    "UMI-0380": "The poster states that academic, administrative, library, and security staff are authorised to reprimand students verbally or in writing if they violate the dress code directions. It also says that a student who does not comply may be prevented from entering or dealing in areas where the provisions apply, and other administrative actions may be taken from time to time.",
+    "UMI-0381": "For official events, the poster illustrates formal attire, including suit-style clothing and traditional formal wear, to convey a neat and official appearance appropriate for formal university occasions.",
+    "UMI-0382": "For lectures, office matters, examinations, and library use, the poster illustrates neat and presentable campus attire rather than ceremonial or sports clothing. The examples shown are everyday academic or administrative outfits suitable for being on campus in those settings.",
+    "UMI-0383": "The poster distinguishes attire by activity. Sportswear is illustrated for sports and recreational activities, while official events use formal or traditional formal clothing, and lectures, office matters, examinations, and library use are shown with neat everyday campus attire. In other words, students are expected to dress according to the setting or activity."
+}
+BAD_ANSWER_PATTERNS = [  # regexes searched case-insensitively by looks_like_bad_answer; any hit rejects the answer
+    r"^###\s*Page\s+\d+",  # answer begins with a "### Page N" markdown page marker
+    r"\bmore info\b",  # contains the phrase "more info"
+    r"fsktm[_\.]?um",  # contains "fsktm_um", "fsktm.um" or "fsktmum"
+    r"POSTGRADUATE\s+PROGRAMME\s+HANDBOOK",  # postgraduate handbook title text
+    r"UNDERGRADUATE\s+PROGRAMME\s+HANDBOOK",  # undergraduate handbook title text
+    r"^C\s*O\s*N\s*T\s*E\s*N\s*T\s*S$",  # the whole answer is a (possibly letter-spaced) "CONTENTS" heading
+]
def normalize_text(text: str) -> str:
    """Return ``text`` with extraction artefacts cleaned up.

    ``None`` becomes an empty string. Non-breaking spaces turn into plain
    spaces, soft hyphens are removed, the "fi"/"fl" ligatures are expanded,
    runs of spaces/tabs collapse to one space, three or more consecutive
    newlines collapse to two, and the result is stripped.
    """
    if text is None:
        return ""
    cleaned = str(text)
    for artefact, replacement in (("\u00a0", " "), ("\xad", ""), ("ﬁ", "fi"), ("ﬂ", "fl")):
        cleaned = cleaned.replace(artefact, replacement)
    cleaned = re.sub(r"[ \t]+", " ", cleaned)
    cleaned = re.sub(r"\n{3,}", "\n\n", cleaned)
    return cleaned.strip()
def normalize_for_compare(text: str) -> str:
    """Reduce ``text`` to a lowercase, space-separated alphanumeric form for matching.

    Everything that is not ``a-z``, ``0-9`` or whitespace is replaced by a
    space, then all whitespace runs are collapsed to a single space.
    """
    lowered = normalize_text(text).lower()
    alnum_only = re.sub(r"[^a-z0-9\s]+", " ", lowered)
    return re.sub(r"\s+", " ", alnum_only).strip()
def tokenize(text: str) -> set[str]:
    """Return the set of comparison tokens in ``text`` that are at least two characters long."""
    return {word for word in normalize_for_compare(text).split() if len(word) >= 2}
def safe_slug(text: str) -> str:
    """Build an underscore-joined slug of at most 80 characters, or ``"item"`` when nothing is left."""
    slug = re.sub(r"\s+", "_", normalize_for_compare(text))
    if not slug:
        return "item"
    return slug[:80]
def truncate_text(text: str, max_chars: int = 1200) -> str:
    """Shorten normalized ``text`` to roughly ``max_chars`` characters.

    Text that already fits is returned unchanged. Otherwise the cut is moved
    back to the last ". " or newline when that break lies beyond character
    200; if no such break exists the hard cut is returned with " ..." appended.
    """
    normalized = normalize_text(text)
    if len(normalized) <= max_chars:
        return normalized
    head = normalized[:max_chars].rstrip()
    boundary = max(head.rfind(". "), head.rfind("\n"))
    if boundary <= 200:
        # No usable sentence/line break: mark the hard cut explicitly.
        return head.strip() + " ..."
    return head[: boundary + 1].strip()
def load_json(path: Path) -> Any:
    """Read the UTF-8 file at ``path`` and return the parsed JSON value."""
    with path.open("r", encoding="utf-8") as handle:
        return json.load(handle)
def load_jsonl(path: Path) -> list[dict]:
    """Parse a UTF-8 JSON Lines file into a list, skipping blank lines.

    A missing file yields an empty list instead of raising.
    """
    if not path.exists():
        return []
    with path.open("r", encoding="utf-8") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        return [json.loads(entry) for entry in stripped_lines if entry]
def write_jsonl(path: Path, rows: list[dict]) -> None:
    """Write ``rows`` to ``path`` as UTF-8 JSON Lines, one object per line, non-ASCII kept verbatim."""
    with path.open("w", encoding="utf-8") as handle:
        handle.writelines(json.dumps(record, ensure_ascii=False) + "\n" for record in rows)
def write_pretty_json(path: Path, rows: list[dict]) -> None:
    """Write ``rows`` to ``path`` as one UTF-8 JSON document indented by two spaces."""
    with path.open("w", encoding="utf-8") as handle:
        json.dump(rows, handle, ensure_ascii=False, indent=2)
def first_non_empty(d: dict, keys: list[str], default: Any = "") -> Any:
    """Return the value of the first key in ``keys`` that ``d`` holds with a non-empty value.

    ``None``, ``""``, ``[]`` and ``{}`` count as empty; ``default`` is
    returned when no key qualifies.
    """
    empties = (None, "", [], {})
    for key in keys:
        if key not in d:
            continue
        value = d[key]
        if value in empties:
            continue
        return value
    return default
def ensure_list(value: Any) -> list:
    """Coerce ``value`` to a list: lists pass through, ``None`` becomes ``[]``, anything else is wrapped."""
    if isinstance(value, list):
        return value
    return [] if value is None else [value]
def extract_index_items(raw: Any) -> list[dict]:
    """Locate the list of index rows inside the loaded index JSON.

    A top-level list is returned as is. For a dict, the first of the keys
    ``entries``, ``items``, ``index``, ``records``, ``data``, ``rows`` whose
    value is a list made only of dicts is returned.

    Raises:
        ValueError: if no supported structure is found.
    """
    if isinstance(raw, list):
        return raw
    if isinstance(raw, dict):
        for container_key in ("entries", "items", "index", "records", "data", "rows"):
            candidate = raw.get(container_key)
            if not isinstance(candidate, list):
                continue
            if all(isinstance(item, dict) for item in candidate):
                return candidate
    raise ValueError(f"Unsupported index JSON structure in {INDEX_PATH}")
def index_id(row: dict, idx: int) -> str:
    """Return the row's id from ``index_id``/``id``/``question_id``, else ``idx_`` plus a 5-digit ``idx``."""
    fallback = f"idx_{idx:05d}"
    found = first_non_empty(row, ["index_id", "id", "question_id"], fallback)
    return str(found)
def index_question(row: dict) -> str:
    """Return the stripped question text from the first populated question-like field, or ``""``."""
    raw_question = first_non_empty(row, ["canonical_question", "question", "core_question", "query"], "")
    return str(raw_question).strip()
def index_scope(row: dict) -> str:
    """Return the index row's scope label, stripped and lowercased, or ``""``."""
    raw_scope = first_non_empty(row, ["scope_label", "scope", "label"], "")
    return str(raw_scope).strip().lower()
def index_section(row: dict) -> str:
    """Return the index row's ``section`` value as a stripped string, or ``""``."""
    raw_section = first_non_empty(row, ["section"], "")
    return str(raw_section).strip()
def index_subsection(row: dict) -> str:
    """Return the index row's ``subsection``/``sub_section`` value as a stripped string, or ``""``."""
    raw_subsection = first_non_empty(row, ["subsection", "sub_section"], "")
    return str(raw_subsection).strip()
def index_source_docs(row: dict) -> list[str]:
    """Return the row's source document names as stripped, non-empty strings."""
    raw_docs = first_non_empty(row, ["source_docs", "source_doc", "source_documents"], [])
    stripped = (str(doc).strip() for doc in ensure_list(raw_docs))
    return [doc for doc in stripped if doc]
def index_keywords(row: dict) -> list[str]:
    """Return the row's ``keywords``/``tags`` as stripped, non-empty strings."""
    raw_keywords = first_non_empty(row, ["keywords", "tags"], [])
    stripped = (str(keyword).strip() for keyword in ensure_list(raw_keywords))
    return [keyword for keyword in stripped if keyword]
def chunk_text(chunk: dict) -> str:
    """Return the chunk's normalized text from the first populated text-like field, or ``""``."""
    raw_text = first_non_empty(chunk, ["text", "chunk_text", "content", "body", "markdown_text"], "")
    return normalize_text(raw_text)
def chunk_id(chunk: dict, idx: int) -> str:
    """Return the chunk's ``chunk_id``/``id`` as a string, else ``chunk_`` plus a 6-digit ``idx``."""
    existing = first_non_empty(chunk, ["chunk_id", "id"], "")
    if existing:
        return str(existing)
    return f"chunk_{idx:06d}"
def chunk_scope(chunk: dict) -> str:
    """Return the chunk's scope label, stripped and lowercased, or ``""``."""
    raw_scope = first_non_empty(chunk, ["scope_label", "scope", "label"], "")
    return str(raw_scope).strip().lower()
def chunk_source_doc(chunk: dict) -> str:
    """Return the chunk's source document name as a stripped string, or ``""``."""
    raw_doc = first_non_empty(chunk, ["source_doc", "source_document", "doc_name"], "")
    return str(raw_doc).strip()
def chunk_section(chunk: dict) -> str:
    """Return the chunk's ``section`` value as a stripped string, or ``""``."""
    raw_section = first_non_empty(chunk, ["section"], "")
    return str(raw_section).strip()
def chunk_subsection(chunk: dict) -> str:
    """Return the chunk's ``subsection``/``sub_section`` value as a stripped string, or ``""``."""
    raw_subsection = first_non_empty(chunk, ["subsection", "sub_section"], "")
    return str(raw_subsection).strip()
def chunk_pages(chunk: dict) -> list[int]:
    """Return the chunk's page information as a list.

    Reads ``pages``, ``source_pages`` or ``page_range``. A list is returned
    unchanged, a tuple is converted, and any other value is wrapped in a
    one-element list; elements are not converted to ``int`` here.
    """
    raw_pages = first_non_empty(chunk, ["pages", "source_pages", "page_range"], [])
    if isinstance(raw_pages, tuple):
        return list(raw_pages)
    # ensure_list passes an existing list through untouched.
    return ensure_list(raw_pages)
def overlap_score(a: set[str], b: set[str]) -> int:
    """Return how many tokens the two sets have in common."""
    shared = a.intersection(b)
    return len(shared)
def looks_like_bad_answer(text: str) -> bool:
    """Decide whether ``text`` should be rejected as an answer.

    An answer is bad when it is empty after normalization, matches any entry
    of ``BAD_ANSWER_PATTERNS`` (case-insensitive, newlines flattened to
    spaces), has fewer than three word-like tokens, or is shorter than 15
    characters.
    """
    normalized = normalize_text(text)
    if not normalized:
        return True
    flattened = normalized.replace("\n", " ")
    if any(re.search(pattern, flattened, flags=re.IGNORECASE) for pattern in BAD_ANSWER_PATTERNS):
        return True
    word_like = re.findall(r"[A-Za-z][A-Za-z&'/-]+", flattened)
    # allow short but valid statements such as vision / mission
    return len(word_like) < 3 or len(flattened) < 15
def score_chunk_for_index(index_row: dict, chunk: dict) -> tuple[int, dict]:
    """Score how well ``chunk`` fits ``index_row`` and explain the score.

    Points: +100 when the row's ``index_id`` is among the chunk's
    ``linked_index_ids``; +30 each for equal scope, section and subsection;
    +20 when any source document of the row equals the chunk's source
    document; +5 per keyword token and +3 per question token found in the
    chunk text; +8 per question/keyword token found in the chunk headings.

    Returns:
        ``(score, reasons)`` where ``reasons`` maps each rule that fired to
        ``True`` or to the overlap count.
    """
    total = 0
    reasons: dict[str, Any] = {}

    def award(points: int, reason: str, value: Any = True) -> None:
        nonlocal total
        total += points
        reasons[reason] = value

    def same_label(left: str, right: str) -> bool:
        # Both sides must be present before they are compared in normalized form.
        return bool(left and right) and normalize_for_compare(left) == normalize_for_compare(right)

    question_tokens = tokenize(index_question(index_row))
    keyword_tokens: set[str] = set()
    for keyword in index_keywords(index_row):
        keyword_tokens.update(tokenize(keyword))
    body_tokens = tokenize(chunk_text(chunk))
    chunk_sec = chunk_section(chunk)
    chunk_subsec = chunk_subsection(chunk)

    if index_row.get("index_id") in set(chunk.get("linked_index_ids", [])):
        award(100, "linked_index_match")

    row_scope, found_scope = index_scope(index_row), chunk_scope(chunk)
    if row_scope and found_scope and row_scope == found_scope:
        award(30, "scope_match")
    if same_label(index_section(index_row), chunk_sec):
        award(30, "section_match")
    if same_label(index_subsection(index_row), chunk_subsec):
        award(30, "subsection_match")

    chunk_doc_key = normalize_for_compare(chunk_source_doc(chunk))
    if any(normalize_for_compare(doc) == chunk_doc_key for doc in index_source_docs(index_row)):
        award(20, "source_doc_match")

    shared_keywords = overlap_score(keyword_tokens, body_tokens)
    if shared_keywords:
        award(shared_keywords * 5, "keyword_overlap", shared_keywords)

    shared_question = overlap_score(question_tokens, body_tokens)
    if shared_question:
        award(shared_question * 3, "question_overlap", shared_question)

    heading_tokens = tokenize(f"{chunk_sec} {chunk_subsec}")
    shared_heading = overlap_score(question_tokens | keyword_tokens, heading_tokens)
    if shared_heading:
        award(shared_heading * 8, "heading_overlap", shared_heading)

    return total, reasons
def choose_best_chunk(index_row: dict, chunks: list[dict]) -> tuple[dict | None, dict]:
    """Pick the highest-scoring chunk for ``index_row``, or ``None`` when nothing is good enough.

    Chunks whose ``linked_index_ids`` contain the row's ``index_id`` are the
    only candidates when any exist; otherwise every chunk competes. The first
    chunk with the top score wins. The winner must reach 120 points in the
    linked case and 160 otherwise.

    Returns:
        ``(chunk_or_None, {"best_score": ..., "match_reasons": ...})``.
    """
    target_id = index_row.get("index_id")
    linked = [c for c in chunks if target_id in set(c.get("linked_index_ids", []))]
    # Strong preference: if exact linked candidates exist, only score within that subset.
    pool = linked or chunks
    if not pool:
        return None, {"best_score": -1, "match_reasons": {}}

    scored = [(score_chunk_for_index(index_row, candidate), candidate) for candidate in pool]
    # max() keeps the first of several equal scores, like a strict ">" scan would.
    (top_score, top_reasons), winner = max(scored, key=lambda item: item[0][0])
    summary = {"best_score": top_score, "match_reasons": top_reasons}

    # When exact linked candidates exist, allow a lower threshold because section/subsection mapping
    # is already controlled by the chunk builder. Otherwise require a stricter score.
    threshold = 120 if linked else 160
    if top_score < threshold:
        return None, summary
    return winner, summary
def extract_identity_line(lines: list[str], question_norm: str) -> str:
    """
    Handle faculty identity pages that may not preserve explicit VISION / MISSION labels
    after PDF extraction. In some handbook pages the two statement lines appear before
    the labels or with spaced-out labels such as "v i s i o n".

    Args:
        lines: raw text lines of the chunk.
        question_norm: normalized question padded with spaces, so that
            ``" vision "`` / ``" mission "`` can be tested as whole words.

    Returns:
        The vision line (starts with "a " and mentions "faculty") or the
        mission line (starts with "to "), truncated to 300 characters, or
        ``""`` when no such line exists.
    """

    def is_label_line(lowered: str) -> bool:
        # Label residue is spelled only with the letters of the known
        # headings. A full sentence can be too (e.g. "to be an eminent
        # research institution"), so the line must also look like a heading:
        # letter-spaced, or at most two words.
        if not re.fullmatch(r"[vmiohsnaetcrpbjdu ]+", lowered):
            return False
        tokens = lowered.split()
        return len(tokens) <= 2 or all(len(tok) == 1 for tok in tokens)

    cleaned = []
    for ln in lines:
        ln = normalize_text(ln)
        if not ln:
            continue
        if re.fullmatch(r"\d+", ln):
            # Bare page numbers.
            continue
        if is_label_line(ln.lower()):
            continue
        cleaned.append(ln)

    def maybe_join(idx: int) -> str:
        # Re-attach a short continuation line when the statement was wrapped
        # right after a connecting word.
        line = cleaned[idx]
        if idx + 1 < len(cleaned):
            nxt = cleaned[idx + 1]
            if len(nxt.split()) <= 8 and not nxt.lower().startswith("to "):
                if line.endswith(("through", "and", "for", "to", "of")):
                    return f"{line} {nxt}".strip()
        return line

    if " vision " in question_norm:
        for i, ln in enumerate(cleaned):
            if ln.lower().startswith("a ") and "faculty" in ln.lower():
                return truncate_text(maybe_join(i), 300)
    if " mission " in question_norm:
        for i, ln in enumerate(cleaned):
            if ln.lower().startswith("to "):
                return truncate_text(maybe_join(i), 300)
    return ""
def extract_labeled_answer(text: str, label: str, stop_labels: list[str]) -> str:
    """Return the text that follows ``label`` up to the next stop label or the end of ``text``.

    Matching is case-insensitive and on whole words. The captured text is
    flattened to one line and truncated to 600 characters. ``""`` is returned
    when the label does not occur.
    """
    alternatives = "|".join(map(re.escape, stop_labels))
    regex = rf"\b{re.escape(label)}\b\s*(.+?)(?:\b(?:{alternatives})\b|$)"
    found = re.search(regex, text, flags=re.IGNORECASE | re.DOTALL)
    if found is None:
        return ""
    flattened = normalize_text(found.group(1)).replace("\n", " ")
    flattened = re.sub(r"\s{2,}", " ", flattened).strip()
    return truncate_text(flattened, 600)
def score_segment(segment: str, question: str, keywords: list[str]) -> int:
    """Score ``segment``: 3 points per shared question token plus 5 per shared keyword token."""
    segment_tokens = tokenize(segment)
    keyword_tokens: set[str] = set()
    for keyword in keywords:
        keyword_tokens.update(tokenize(keyword))
    question_points = overlap_score(segment_tokens, tokenize(question)) * 3
    keyword_points = overlap_score(segment_tokens, keyword_tokens) * 5
    return question_points + keyword_points
def extract_answer_from_chunk(index_row: dict, chunk: dict) -> str:
    """Extract an answer for the index row's question from the chunk text.

    Vision, mission and objectives questions are first answered with
    label-aware extraction (vision and mission also fall back to
    ``extract_identity_line``). Every other question, and any label-aware
    miss, uses a scored fallback: windows of 1-3 consecutive lines are scored
    against the question and keywords, the best windows are picked, and their
    line ranges are merged so that no line is repeated in the answer.

    Returns:
        The answer text, or ``""`` when nothing acceptable was found.
    """
    text = chunk_text(chunk)
    q = index_question(index_row)
    q_norm = " " + normalize_for_compare(q) + " "
    kws = index_keywords(index_row)
    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]

    # Explicit handling for faculty identity pages:
    # (question triggers, label, stop labels, try extract_identity_line afterwards)
    labelled_rules = (
        ((" vision ",), "VISION", ["MISSION", "OBJECTIVES", "HISTORY", "STAFF"], True),
        ((" mission ",), "MISSION", ["OBJECTIVES", "HISTORY", "STAFF"], True),
        (
            (" objective ", " objectives "),
            "OBJECTIVES",
            ["HISTORY", "STAFF", "PROGRAMME", "ACADEMIC CALENDAR"],
            False,
        ),
    )
    for triggers, label, stop_labels, try_identity in labelled_rules:
        if not any(trigger in q_norm for trigger in triggers):
            continue
        ans = extract_labeled_answer(text, label, stop_labels)
        if ans and not looks_like_bad_answer(ans):
            return ans
        if try_identity:
            ans = extract_identity_line(lines, q_norm)
            if ans and not looks_like_bad_answer(ans):
                return ans

    # Fallback: choose best short segment(s). Each window remembers the
    # half-open unit range [start, stop) it covers.
    if lines:
        units = lines
        spans = (1, 2, 3)
    else:
        units = re.split(r"(?<=[.!?])\s+", text)
        spans = (1,)
    windows = []
    for start in range(len(units)):
        for span in spans:
            piece = " ".join(units[start:start + span]).strip()
            if piece:
                windows.append((start, min(start + span, len(units)), piece))

    scored = []
    for start, stop, seg in windows:
        seg = normalize_text(seg)
        if not seg or looks_like_bad_answer(seg):
            continue
        score = score_segment(seg, q, kws)
        if score > 0:
            scored.append((score, start, stop, seg))
    if not scored:
        cleaned = truncate_text(text, 900)
        return "" if looks_like_bad_answer(cleaned) else cleaned

    # Highest score first; among equal scores the shorter segment wins.
    scored.sort(key=lambda item: (-item[0], len(item[3])))
    picked_ranges = []
    picked_text = []
    seen = set()
    for _, start, stop, seg in scored:
        seg_key = normalize_for_compare(seg)
        if seg_key in seen:
            continue
        seen.add(seg_key)
        picked_text.append(seg)
        picked_ranges.append((start, stop))
        if len(" ".join(picked_text)) > 450 or len(picked_text) >= 3:
            break

    # The windows overlap by construction ("A", "A B", "A B C"), so joining
    # the picks verbatim would repeat lines. Merge overlapping or touching
    # ranges instead; a merged group keeps the rank of its best member.
    merged = []
    for lo, hi in picked_ranges:
        slot = None
        remaining = []
        for g_lo, g_hi in merged:
            if lo <= g_hi and g_lo <= hi:
                lo, hi = min(lo, g_lo), max(hi, g_hi)
                if slot is None:
                    slot = len(remaining)
            else:
                remaining.append((g_lo, g_hi))
        if slot is None:
            remaining.append((lo, hi))
        else:
            remaining.insert(slot, (lo, hi))
        merged = remaining

    answer = normalize_text(" ".join(" ".join(units[lo:hi]) for lo, hi in merged))
    answer = truncate_text(answer, 700)
    return "" if looks_like_bad_answer(answer) else answer
def manual_override_answer(index_row: dict) -> str:
    """Return the hand-written answer stored in ``MANUAL_QA_OVERRIDES`` for this row's ``index_id``, or ``""``."""
    lookup_key = str(index_row.get("index_id", "")).strip()
    return MANUAL_QA_OVERRIDES.get(lookup_key, "")
def build_metadata_row(index_row: dict, chosen_chunk: dict | None, match_meta: dict, qa_idx: int) -> dict:
    """Assemble the metadata record for one index row.

    A manual override answer always wins. Without one, the answer is
    extracted from ``chosen_chunk`` when a chunk was matched.

    ``review_status`` is one of ``manual_visual_override``, ``unmatched``
    (no chunk, no override), ``matched_needs_review`` or
    ``bad_match_filtered`` (chunk matched but no usable answer extracted).
    """
    source_docs = index_source_docs(index_row)
    scope_value = index_scope(index_row)
    section_value = index_section(index_row)
    subsection_value = index_subsection(index_row)

    resolved_answer = manual_override_answer(index_row)
    if chosen_chunk is None:
        review_state = "manual_visual_override" if resolved_answer else "unmatched"
        document_name = source_docs[0] if source_docs else ""
        chunk_ref = ""
        pages = []
    else:
        review_state = "manual_visual_override" if resolved_answer else "matched_needs_review"
        if not resolved_answer:
            resolved_answer = extract_answer_from_chunk(index_row, chosen_chunk)
            if not resolved_answer:
                review_state = "bad_match_filtered"
        document_name = chunk_source_doc(chosen_chunk)
        chunk_ref = chosen_chunk.get("chunk_id", "")
        pages = chunk_pages(chosen_chunk)

    return {
        "qa_id": f"qa_{qa_idx:06d}",
        "index_id": index_id(index_row, qa_idx),
        "question": index_question(index_row),
        "answer": resolved_answer,
        "scope_label": scope_value,
        "source_doc": document_name,
        "section": section_value,
        "subsection": subsection_value,
        "chunk_id": chunk_ref,
        "source_pages": pages,
        "keywords": index_keywords(index_row),
        "source_docs_from_index": source_docs,
        "retrieval_tags": [
            tag for tag in [scope_value, safe_slug(section_value), safe_slug(subsection_value)] if tag
        ],
        "manual_review_priority": first_non_empty(index_row, ["manual_review_priority", "priority"], "normal"),
        "review_status": review_state,
        "match_score": match_meta.get("best_score", -1),
        "match_reasons": match_meta.get("match_reasons", {}),
        "notes": first_non_empty(index_row, ["note", "notes"], ""),
    }
def build_training_row(metadata_row: dict) -> dict:
    """Project a metadata record onto the four fields kept in the training file."""
    kept_fields = ("qa_id", "index_id", "question", "answer")
    return {field: metadata_row[field] for field in kept_fields}
def main() -> None:
    """Build the Baseline 1 SFT question/answer dataset.

    Loads the index and the source chunks, matches each index row to its best
    chunk, writes the metadata rows and the training-ready rows (each as
    JSONL and pretty JSON), and writes a build report with the row counts.
    """
    print("[INFO] Loading index...")
    index_rows = extract_index_items(load_json(INDEX_PATH))
    print(f"[INFO] Loaded index items: {len(index_rows)}")
    print("[INFO] Loading chunks...")
    chunks = load_jsonl(CHUNK_PATH)
    print(f"[INFO] Loaded chunk rows: {len(chunks)}")

    # Work on copies so that every chunk is guaranteed to carry a chunk_id.
    normalized_chunks = []
    for position, source_chunk in enumerate(chunks, start=1):
        chunk_copy = dict(source_chunk)
        if not chunk_copy.get("chunk_id"):
            chunk_copy["chunk_id"] = chunk_id(chunk_copy, position)
        normalized_chunks.append(chunk_copy)

    metadata_rows = []
    training_rows = []
    tally = {"matched": 0, "unmatched": 0, "filtered": 0}
    for position, source_row in enumerate(index_rows, start=1):
        row = dict(source_row)
        if not row.get("index_id"):
            row["index_id"] = index_id(row, position)
        best_chunk, match_meta = choose_best_chunk(row, normalized_chunks)
        record = build_metadata_row(row, best_chunk, match_meta, position)
        metadata_rows.append(record)

        # Every row is kept in the metadata; only rows with an answer are trained on.
        status = record["review_status"]
        if status == "unmatched":
            tally["unmatched"] += 1
        elif status == "bad_match_filtered" or not record["answer"]:
            tally["filtered"] += 1
        else:
            training_rows.append(build_training_row(record))
            tally["matched"] += 1

    write_jsonl(METADATA_PATH, metadata_rows)
    write_pretty_json(METADATA_PRETTY_PATH, metadata_rows)
    write_jsonl(TRAINING_READY_PATH, training_rows)
    write_pretty_json(TRAINING_READY_PRETTY_PATH, training_rows)

    report = {
        "stage": "baseline_1",
        "format": "question_answer_only",
        "inputs": {
            "index_path": str(INDEX_PATH),
            "chunk_path": str(CHUNK_PATH),
        },
        "outputs": {
            "metadata_path": str(METADATA_PATH),
            "metadata_pretty_path": str(METADATA_PRETTY_PATH),
            "training_ready_path": str(TRAINING_READY_PATH),
            "training_ready_pretty_path": str(TRAINING_READY_PRETTY_PATH),
        },
        "counts": {
            "index_rows": len(index_rows),
            "chunk_rows": len(normalized_chunks),
            "metadata_rows": len(metadata_rows),
            "training_ready_rows": len(training_rows),
            "matched_rows": tally["matched"],
            "unmatched_rows": tally["unmatched"],
            "filtered_bad_match_rows": tally["filtered"],
        },
        "notes": [
            "This build is for Baseline 1 only.",
            "Training-ready rows contain only question and answer fields.",
            "Exact linked_index_id candidates are preferred when available.",
            "Bad cover/content/heading-only answers are filtered out.",
            "Vision/Mission/Objectives questions use explicit label-aware extraction when possible.",
        ],
    }
    REPORT_PATH.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8")

    for written in (
        METADATA_PATH,
        METADATA_PRETTY_PATH,
        TRAINING_READY_PATH,
        TRAINING_READY_PRETTY_PATH,
        REPORT_PATH,
    ):
        print(f"Wrote: {written}")
+if __name__ == "__main__":
+    main()

UM_Handbook/UM_Source_Chunk_Dataset_Builder.py ADDED Viewed

	@@ -0,0 +1,265 @@

+from pathlib import Path
+import json
+import re
+from collections import defaultdict
+from um_handbook_config import DATA_ROOT, MARKDOWN_DIR, INDEX_DIR, CHUNKS_DIR
+PROJECT_DIR = Path(__file__).resolve().parent  # directory containing this script
+DATA_ROOT.mkdir(exist_ok=True)  # working directories are created at import time; parents are not created
+MARKDOWN_DIR.mkdir(exist_ok=True)
+INDEX_DIR.mkdir(exist_ok=True)
+CHUNKS_DIR.mkdir(exist_ok=True)
+GENERAL_MD = MARKDOWN_DIR / "general_handbook_structured.md"  # input: structured markdown, general handbook
+COMPLETE_MD = MARKDOWN_DIR / "complete_handbook_structured.md"  # input: structured markdown, complete handbook
+INDEX_PATH = INDEX_DIR / "UM_Manual_Index.json"  # input: manual index; NOTE(review): confirm this file name matches the index file actually shipped
+OUT_JSONL = CHUNKS_DIR / "Source_Chunks_Dataset.jsonl"  # output: one chunk per line
+OUT_PRETTY = CHUNKS_DIR / "Source_Chunks_Dataset_pretty.json"  # output: same rows, indented JSON
+OUT_REPORT = CHUNKS_DIR / "Source_Chunks_Dataset_report.json"  # output: chunk counts and notes
+STOPWORDS = {  # words ignored by keywords_from_text
+    "the", "a", "an", "and", "or", "of", "to", "in", "on", "for", "with", "from", "is", "are", "was", "were", "what", "which",
+    "who", "how", "when", "where", "why", "this", "that", "these", "those", "their", "there", "into", "about", "under",
+    "through", "using", "used", "students", "student", "programme", "program", "handbook", "faculty", "computer", "science",
+    "information", "technology", "universiti", "malaya",
+}
+SPECIAL_SINGLE_CHUNK_SECTIONS = {"Student Dress Code"}  # sections whose page text is kept as one chunk instead of being split
+BAD_CHUNK_PATTERNS = [  # regexes searched case-insensitively by looks_like_bad_chunk
+    r"\bmore info\b",  # contains the phrase "more info"
+    r"fsktm[_\.]?um",  # contains "fsktm_um", "fsktm.um" or "fsktmum"
+    r"POSTGRADUATE\s+PROGRAMME\s+HANDBOOK",  # postgraduate handbook title text
+    r"UNDERGRADUATE\s+PROGRAMME\s+HANDBOOK",  # undergraduate handbook title text
+    r"^C\s*O\s*N\s*T\s*E\s*N\s*T\s*S$",  # the whole chunk is a (possibly letter-spaced) "CONTENTS" heading
+]
def normalize_text(text: str) -> str:
    """Return ``text`` with extraction artefacts cleaned up.

    Non-breaking spaces turn into plain spaces, soft hyphens are removed, the
    "fi"/"fl" ligatures are expanded, runs of spaces/tabs collapse to one
    space, three or more consecutive newlines collapse to two, and the result
    is stripped.
    """
    cleaned = text
    for artefact, replacement in (("\u00a0", " "), ("\xad", ""), ("ﬁ", "fi"), ("ﬂ", "fl")):
        cleaned = cleaned.replace(artefact, replacement)
    cleaned = re.sub(r"[ \t]+", " ", cleaned)
    cleaned = re.sub(r"\n{3,}", "\n\n", cleaned)
    return cleaned.strip()
def read_markdown_sections(path: Path) -> list[dict]:
    """Parse a structured markdown file into one dict per ``## `` section.

    Each section starts with its title line, optionally followed by
    ``- key: value`` metadata lines, then the body. The title is split on
    ``::`` into ``section`` and ``subsection``. Empty parts and a part that
    starts with ``# `` (the document title) are skipped.
    """
    document = path.read_text(encoding="utf-8")
    parsed = []
    for block in re.split(r"(?m)^## ", document):
        if block.startswith("# ") or not block.strip():
            continue
        block_lines = block.splitlines()
        total = len(block_lines)
        heading = block_lines[0].strip()
        record = {"title": heading}

        def skip_blank(pos: int) -> int:
            # Advance past blank lines, stopping at the end of the block.
            while pos < total and not block_lines[pos].strip():
                pos += 1
            return pos

        cursor = skip_blank(1)
        while cursor < total and block_lines[cursor].startswith("- "):
            key, _, value = block_lines[cursor][2:].partition(":")
            record[key.strip()] = value.strip()
            cursor += 1
        cursor = skip_blank(cursor)

        record["body"] = "\n".join(block_lines[cursor:]).strip()
        left, _, right = heading.partition("::")
        record["section"] = left.strip()
        record["subsection"] = right.strip()
        parsed.append(record)
    return parsed
def parse_pages(meta_pages: str) -> list[int]:
    """Turn a pages metadata string into page numbers.

    ``"3-5"`` expands to ``[3, 4, 5]``. Any other text yields every integer
    found in it, in order. An empty value yields ``[]``.
    """
    if not meta_pages:
        return []
    span = re.match(r"^(\d+)-(\d+)$", meta_pages.strip())
    if span is None:
        return [int(digits) for digits in re.findall(r"\d+", meta_pages)]
    first, last = (int(group) for group in span.groups())
    return list(range(first, last + 1))
def split_body_by_pages(body: str, fallback_pages: list[int]) -> list[tuple[list[int], str]]:
    """Split a section body at its ``### Page N`` markers.

    Args:
        body: section body text.
        fallback_pages: pages to attribute text to when no marker applies.

    Returns:
        A list of ``(pages, text)`` parts. Without markers the whole body is
        one part on ``fallback_pages``. With markers, each marker starts a
        part on ``[N]``; non-empty text before the first marker is kept as a
        leading part on ``fallback_pages`` rather than being dropped.
    """
    body = body.strip()
    markers = list(re.finditer(r"(?m)^### Page (\d+)\s*$", body))
    if not markers:
        return [(fallback_pages, body)]

    parts = []
    # Text ahead of the first marker belongs to no specific page.
    preamble = body[: markers[0].start()].strip()
    if preamble:
        parts.append((fallback_pages, preamble))

    for idx, marker in enumerate(markers):
        page_no = int(marker.group(1))
        # A part runs from the end of its marker to the start of the next one.
        end = markers[idx + 1].start() if idx + 1 < len(markers) else len(body)
        parts.append(([page_no], body[marker.end():end].strip()))
    return parts
def clean_text(text: str) -> str:
    """Drop blank lines and ``### Page N`` marker lines, then normalize the remaining text."""
    kept = [
        stripped
        for stripped in (raw.strip() for raw in text.splitlines())
        if stripped and not re.fullmatch(r"### Page \d+", stripped)
    ]
    joined = "\n".join(kept)
    joined = re.sub(r"\n{3,}", "\n\n", joined)
    joined = re.sub(r"[ \t]+", " ", joined)
    return normalize_text(joined)
def looks_like_bad_chunk(text: str) -> bool:
    """Decide whether ``text`` is too uninformative to keep as a chunk.

    A chunk is bad when it is empty, has fewer than 12 word-like tokens,
    matches any entry of ``BAD_CHUNK_PATTERNS`` while having fewer than 70
    word-like tokens, or is all upper-case and shorter than 140 characters.
    """
    normalized = normalize_text(text)
    if not normalized:
        return True
    flattened = normalized.replace("\n", " ")
    word_count = len(re.findall(r"[A-Za-z][A-Za-z&'/-]+", flattened))
    if word_count < 12:
        return True
    # Cover/contents markers only disqualify short chunks.
    if word_count < 70 and any(
        re.search(pattern, flattened, flags=re.IGNORECASE) for pattern in BAD_CHUNK_PATTERNS
    ):
        return True
    return len(flattened) < 140 and flattened.upper() == flattened
def chunk_lines(text: str, max_chars: int = 1100) -> list[str]:
    """
    Keep chunks moderately small so answer extraction later can stay focused.

    Non-blank lines are stripped and packed greedily, joined by newlines: a
    line starts a new chunk when adding it would push the current chunk past
    ``max_chars``. A single line longer than ``max_chars`` stays whole.
    """
    pieces = (stripped for stripped in map(str.strip, text.splitlines()) if stripped)
    packed = []
    bucket = []
    size = 0
    for piece in pieces:
        if not bucket:
            bucket, size = [piece], len(piece)
            continue
        grown = size + 1 + len(piece)  # +1 for the joining newline
        if grown > max_chars:
            packed.append("\n".join(bucket).strip())
            bucket, size = [piece], len(piece)
        else:
            bucket.append(piece)
            size = grown
    if bucket:
        packed.append("\n".join(bucket).strip())
    return packed
def keywords_from_text(text: str, limit: int = 12) -> list[str]:
    """Return up to ``limit`` keywords from ``text``, most frequent first, ties broken alphabetically.

    Words are lowercased; stopwords and words shorter than three characters
    are ignored.
    """
    counts = {}
    for word in re.findall(r"[A-Za-z][A-Za-z0-9&/\-']+", text.lower()):
        if len(word) < 3 or word in STOPWORDS:
            continue
        counts[word] = counts.get(word, 0) + 1
    ranked = sorted(counts, key=lambda word: (-counts[word], word))
    return ranked[:limit]
def load_index_entries() -> list[dict]:
    """Load ``INDEX_PATH`` and return the value stored under its ``entries`` key.

    Raises:
        ValueError: if the JSON root is not a dict with an ``entries`` key.
    """
    payload = json.loads(INDEX_PATH.read_text(encoding="utf-8"))
    if not (isinstance(payload, dict) and "entries" in payload):
        raise ValueError(f"Unsupported Manual_Index JSON format in {INDEX_PATH}")
    return payload["entries"]
def section_key(section: str, subsection: str) -> str:
    """Build the ``section::subsection`` lookup key from the two stripped names."""
    return "::".join((section.strip(), subsection.strip()))
def main() -> None:
    """Build the source chunk dataset from the two structured markdown files.

    Every markdown section is split by page and then into line-packed chunks;
    low-information chunks are dropped. Each chunk row is linked to the index
    ids whose section/subsection equals the chunk's. The rows are written as
    JSONL and pretty JSON, together with a small report.

    Raises:
        FileNotFoundError: if either structured markdown file is missing.
    """
    if not (GENERAL_MD.exists() and COMPLETE_MD.exists()):
        raise FileNotFoundError("Run UM_Handbook_Markdown_Preprocess.py first.")

    ids_by_section = defaultdict(list)
    for entry in load_index_entries():
        ids_by_section[section_key(entry["section"], entry["subsection"])].append(entry["index_id"])

    rows = []
    for sec in read_markdown_sections(GENERAL_MD) + read_markdown_sections(COMPLETE_MD):
        fallback_pages = parse_pages(sec.get("pages", ""))
        linked = ids_by_section.get(section_key(sec["section"], sec["subsection"]), [])
        keep_whole = sec.get("section") in SPECIAL_SINGLE_CHUNK_SECTIONS
        chunk_index = 0  # running chunk number within the section
        for page_list, page_text in split_body_by_pages(sec["body"], fallback_pages):
            page_text = clean_text(page_text)
            if looks_like_bad_chunk(page_text):
                continue
            pieces = [page_text] if keep_whole else chunk_lines(page_text)
            for piece in pieces:
                piece = clean_text(piece)
                if looks_like_bad_chunk(piece):
                    continue
                chunk_index += 1
                rows.append(
                    {
                        "chunk_id": f"SC-{len(rows)+1:05d}",
                        "source_doc": sec.get("source_doc"),
                        "scope_label": sec.get("scope_label"),
                        "section": sec.get("section"),
                        "subsection": sec.get("subsection"),
                        "pages": page_list or fallback_pages,
                        "chunk_index": chunk_index,
                        "text": piece,
                        "keywords": keywords_from_text(piece),
                        "linked_index_ids": linked,
                    }
                )

    with OUT_JSONL.open("w", encoding="utf-8") as handle:
        handle.writelines(json.dumps(row, ensure_ascii=False) + "\n" for row in rows)
    OUT_PRETTY.write_text(json.dumps(rows, ensure_ascii=False, indent=2), encoding="utf-8")

    # Chunk count per scope label, reported in sorted label order.
    scope_tally = {}
    for row in rows:
        scope_tally[row["scope_label"]] = scope_tally.get(row["scope_label"], 0) + 1
    scope_distribution = {label: scope_tally[label] for label in sorted(scope_tally)}

    report = {
        "total_chunks": len(rows),
        "scope_distribution": scope_distribution,
        "notes": [
            "Chunks are generated from the structured markdown files, not directly from raw PDF pages.",
            "Low-information cover/content/divider chunks are filtered out.",
            "Chunk pages are preserved from per-page markdown markers when available.",
            "Linked Manual_Index ids are based on exact section/subsection matches from UM_Manual_Index.json.",
        ],
    }
    OUT_REPORT.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8")

    for written in (OUT_JSONL, OUT_PRETTY, OUT_REPORT):
        print(f"Wrote: {written}")
+if __name__ == "__main__":
+    main()

UM_Handbook/assets/TensorCat.png ADDED Viewed

UM_Handbook/outputs/qwen3_um_handbook_optimized_1/merged_model/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,89 @@

+{%- if tools %}{#- Tools given: one system turn holds the optional system message plus the tool signatures. -#}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}{#- No tools: emit the leading system message, if any, as its own turn. -#}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}{#- last_query_index defaults to the final message. -#}
+{%- for message in messages[::-1] %}{#- Scan backwards for the last user message that is not a wrapped tool response. -#}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if message.content is string %}
+        {%- set content = message.content %}
+    {%- else %}
+        {%- set content = '' %}
+    {%- endif %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}{#- The first system message was already emitted above. -#}
+        {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}{#- Separate reasoning from the visible answer. -#}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is string %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in content %}
+                {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+                {%- set content = content.split('</think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}{#- Only assistant turns after the last user query may keep their think block. -#}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}{#- Each tool call is serialized as a JSON object inside tool_call tags. -#}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}{#- Consecutive tool messages share one user turn, each wrapped in tool_response tags. -#}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}{#- Open the assistant turn for generation. -#}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}{#- Thinking disabled: pre-fill an empty think block. -#}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}

UM_Handbook/outputs/qwen3_um_handbook_optimized_1/merged_model/config.json ADDED Viewed

	@@ -0,0 +1,71 @@

+{
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "dtype": "bfloat16",
+  "eos_token_id": 151645,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 12288,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 40960,
+  "max_window_layers": 36,
+  "model_type": "qwen3",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 8,
+  "pad_token_id": null,
+  "rms_norm_eps": 1e-06,
+  "rope_parameters": {
+    "rope_theta": 1000000,
+    "rope_type": "default"
+  },
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "transformers_version": "5.3.0",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}

UM_Handbook/outputs/qwen3_um_handbook_optimized_1/merged_model/generation_config.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "bos_token_id": 151643,
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "temperature": 0.6,
+  "top_k": 20,
+  "top_p": 0.95,
+  "transformers_version": "5.3.0"
+}

UM_Handbook/outputs/qwen3_um_handbook_optimized_1/merged_model/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e946ac23b6a68f7a2abbe7b3c22190673c6d3d159b85305268db51b2729ac68a
+size 11422749

UM_Handbook/outputs/qwen3_um_handbook_optimized_1/merged_model/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "is_local": true,
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

UM_Handbook/um_handbook_config.py ADDED Viewed

	@@ -0,0 +1,230 @@

+from pathlib import Path
+# Directory that contains this config module; every data path below is derived from it.
+PROJECT_DIR = Path(__file__).resolve().parent
+# Root data directory
+DATA_ROOT = PROJECT_DIR / "Dataset"
+# Subdirectories (kept consistent with the existing workflow)
+PDF_DIR = DATA_ROOT / "pdf"
+MARKDOWN_DIR = DATA_ROOT / "markdown"
+INDEX_DIR = DATA_ROOT / "Manual_Index"
+CHUNKS_DIR = DATA_ROOT / "Source Chunk Dataset"
+# NOTE(review): the file list of this commit shows "Dataset/SFT_Dataset" and
+# "Dataset/SFT Dataset Core Questions Draft", but no "Dataset/SFT_Dataset_Draft".
+# Confirm this is the directory the SFT builder is expected to read/write.
+SFT_DIR = DATA_ROOT / "SFT_Dataset_Draft"
+REPORTS_DIR = DATA_ROOT / "reports"
+# Input PDFs
+GENERAL_PDF = PDF_DIR / "General Handbook.pdf"
+COMPLETE_PDF = PDF_DIR / "Complete Handbook.pdf"
+# Hand-written text summary of the UM student dress code poster. It is attached
+# as the "manual_text" value of the "Student Dress Code" entry in COMPLETE_BLOCKS
+# (PDF pages 297-298).
+# NOTE(review): presumably used in place of text extracted from those pages —
+# confirm against the code that consumes "manual_text".
+DRESS_CODE_MANUAL_TEXT = """
+UM STUDENT DRESS CODE AND APPEARANCE POSTER SUMMARY
+Compliance message
+- All Universiti Malaya students must adhere to the Universiti Malaya Administrative Directions (Student Dress Code and Appearance) 2024 while on campus.
+Illustrated attire categories on the poster
+- Official Events: the poster illustrates formal or traditional formal attire for official university occasions.
+- Lectures, Office Matters, Examination and Library: the poster illustrates neat, presentable campus attire for normal academic and administrative settings.
+- Sports and Recreational: the poster illustrates sportswear for sports and recreational activities.
+Enforcement and action
+- Academic, administrative, library and security staff members are authorised to reprimand students verbally or in writing if they violate the Administrative Directions.
+- A student who does not comply may be prevented from entering or dealing in areas where the provisions apply.
+- Other administrative actions may also be taken from time to time.
+Important limitation
+- This poster illustrates categories of appropriate attire and enforcement expectations, but it does not provide an exhaustive item-by-item prohibited clothing list.
+"""
+# ----------------------------
+# General handbook blocks
+#
+# Each entry is a dict with the keys "source_doc", "scope_label", "section",
+# "subsection" and "pages"; "pages" is a (first_page, last_page) tuple
+# (single pages are written as (n, n)).
+#
+# NOTE:
+# This PDF contains two handbook-style front sections. The current workflow
+# intentionally uses the later normalized pages (e.g. 9, 10, 11...) for the
+# "General Handbook" layer, because those pages contain the cleaner normalized
+# general/common content that matches the current index design.
+#
+# NOTE(review): the postgraduate Academic Calendar entry below points at
+# page 4, i.e. outside the "later pages" described above — confirm that this
+# exception is intentional.
+# ----------------------------
+GENERAL_BLOCKS = [
+    {"source_doc": "General Handbook", "scope_label": "general", "section": "Faculty Objectives", "subsection": "Faculty Objectives", "pages": (9, 9)},
+    {"source_doc": "General Handbook", "scope_label": "general", "section": "History of the Faculty", "subsection": "History Overview", "pages": (10, 11)},
+    {"source_doc": "General Handbook", "scope_label": "postgraduate", "section": "Academic Calendar 2025/2026", "subsection": "Master and Doctorate Level Academic Calendar", "pages": (4, 4)},
+    {"source_doc": "General Handbook", "scope_label": "undergraduate", "section": "Academic Calendar 2025/2026", "subsection": "Bachelor Degree Level Academic Calendar", "pages": (12, 12)},
+    {"source_doc": "General Handbook", "scope_label": "general", "section": "Teaching and Learning Facilities", "subsection": "Teaching Labs", "pages": (13, 14)},
+    {"source_doc": "General Handbook", "scope_label": "general", "section": "Teaching and Learning Facilities", "subsection": "Research Labs", "pages": (14, 16)},
+    {"source_doc": "General Handbook", "scope_label": "general", "section": "Teaching and Learning Facilities", "subsection": "Project Based Labs", "pages": (16, 16)},
+    {"source_doc": "General Handbook", "scope_label": "general", "section": "Other Facilities", "subsection": "Student Support and Campus Facilities", "pages": (17, 17)},
+]
+# ----------------------------
+# Postgraduate programme blocks
+# Pages here are PDF physical pages, not handbook-printed page numbers.
+# These were aligned against the uploaded merged Complete Handbook PDF.
+#
+# Each entry is a dict with:
+#   "code"        - short programme identifier (e.g. "PG-AC")
+#   "name"        - full programme title
+#   "scope_label" - audience label for the programme
+#   "blocks"      - list of (subsection title, (first_page, last_page)) tuples
+# ----------------------------
+PG_PROGRAMMES = [
+    {
+        "code": "PG-AC",
+        "name": "Master of Computer Science (Applied Computing)",
+        "scope_label": "postgraduate",
+        "blocks": [
+            ("Programme Requirements", (37, 38)),
+            ("Programme Objectives and Outcomes", (39, 40)),
+            ("Candidature Requirements", (41, 41)),
+            ("Graduate on Time (GOT) Schedule", (42, 42)),
+            ("Course Plan", (43, 44)),
+            ("List of Courses and Contents", (45, 50)),
+        ],
+    },
+    {
+        "code": "PG-SE",
+        "name": "Master of Software Engineering (Software Technology)",
+        "scope_label": "postgraduate",
+        "blocks": [
+            ("Programme Requirements", (52, 53)),
+            ("Programme Objectives and Outcomes", (54, 55)),
+            ("Candidature Requirements", (56, 56)),
+            ("Graduate on Time (GOT) Schedule", (57, 57)),
+            # NOTE(review): page 58 is not mapped by any block of this
+            # programme (57 -> 59) — confirm the gap is intentional.
+            ("Course Plan", (59, 61)),
+            ("List of Courses and Contents", (62, 68)),
+        ],
+    },
+    {
+        "code": "PG-DS",
+        "name": "Master in Data Science",
+        "scope_label": "postgraduate",
+        "blocks": [
+            ("Programme Requirements", (70, 71)),
+            ("Programme Objectives and Outcomes", (72, 74)),
+            ("Course Plan", (75, 76)),
+            ("List of Courses and Contents", (77, 82)),
+        ],
+    },
+    {
+        "code": "PG-CSY",
+        "name": "Master of Cyber Security",
+        "scope_label": "postgraduate",
+        "blocks": [
+            ("Programme Requirements", (84, 86)),
+            ("Programme Objectives and Outcomes", (87, 88)),
+            ("Course Plan", (89, 90)),
+            ("List of Courses and Contents", (91, 97)),
+        ],
+    },
+    {
+        "code": "PG-AI",
+        "name": "Master of Artificial Intelligence",
+        "scope_label": "postgraduate",
+        "blocks": [
+            ("Programme Requirements", (99, 100)),
+            ("Programme Objectives and Outcomes", (101, 102)),
+            ("Course Plan", (103, 103)),
+            ("List of Courses and Contents", (104, 111)),
+        ],
+    },
+    {
+        "code": "PG-MR",
+        "name": "Master of Computer Science (By Research)",
+        "scope_label": "postgraduate",
+        "blocks": [
+            ("Programme Requirements", (113, 113)),
+            ("Learning Objectives and Outcomes", (114, 115)),
+            ("Candidature Requirements", (116, 116)),
+            ("Graduate on Time (GOT) Schedule", (117, 117)),
+            ("Research Methodology / Course Contents", (118, 118)),
+        ],
+    },
+    {
+        "code": "PG-PHD",
+        "name": "Doctor of Philosophy",
+        "scope_label": "postgraduate",
+        "blocks": [
+            ("Advanced Research Methods Course Content", (120, 120)),
+            ("Programme Education Objectives", (121, 121)),
+            ("Learning Outcomes", (122, 122)),
+            ("Candidature Requirements", (123, 123)),
+            ("Proposed Graduate on Time (GOT) Schedule", (124, 124)),
+        ],
+    },
+]
+# Undergraduate programmes as (code, name, (first_page, last_page)) tuples.
+# The page range is used for each programme's "Undergraduate Programme Goals
+# and Learning Outcomes" block in the Complete Handbook.
+UG_PROGRAMMES = [
+    ("UG-CSN", "Bachelor of Computer Science (Computer System and Network)", (202, 204)),
+    ("UG-AI", "Bachelor of Computer Science (Artificial Intelligence)", (206, 208)),
+    ("UG-IS", "Bachelor of Computer Science (Information Systems)", (210, 212)),
+    ("UG-SE", "Bachelor of Computer Science (Software Engineering)", (214, 216)),
+    ("UG-MM", "Bachelor of Computer Science (Multimedia Computing)", (218, 220)),
+    ("UG-DS", "Bachelor of Computer Science (Data Science)", (222, 224)),
+]
+# ----------------------------
+# Complete handbook blocks
+#
+# Each entry is a dict with the keys "source_doc", "scope_label", "section",
+# "subsection" and "pages" ((first_page, last_page) of the merged PDF). An
+# entry may additionally carry "manual_text" (see the Student Dress Code entry).
+# Per-programme entries are appended to this list by the loops that follow it.
+#
+# IMPORTANT VERIFIED FIX:
+# In the uploaded merged Complete Handbook PDF:
+#   - PDF page 186 contains the postgraduate-style Vision/Mission page:
+#       Vision: "A globally-influential faculty, enriching lives & shaping the future through computing technology"
+#       Mission: "To enrich lives and shape the future for the nation and humanity through education, research and technopreneurship"
+#   - PDF page 187 contains the undergraduate-style Vision/Mission page:
+#       Vision: "A global faculty impacting the world"
+#       Mission: "Propelling computing technology and producing world class leaders"
+#
+# The previous broken mapping pointed both PG and UG identity to the same page.
+# That caused the same answer to be returned for both questions.
+#
+# NOTE(review): PDF pages 21-36 are not covered by any entry here or in
+# PG_PROGRAMMES (Faculty Staff entries stop at page 20 and the first programme
+# block starts at page 37) — confirm nothing needed lives on those pages.
+# ----------------------------
+COMPLETE_BLOCKS = [
+    {"source_doc": "Complete Handbook", "scope_label": "postgraduate", "section": "Postgraduate Faculty Identity", "subsection": "Vision and Mission", "pages": (186, 186)},
+    {"source_doc": "Complete Handbook", "scope_label": "general", "section": "Faculty Staff", "subsection": "Dean's Office and Management", "pages": (6, 8)},
+    {"source_doc": "Complete Handbook", "scope_label": "general", "section": "Faculty Staff", "subsection": "Department of Artificial Intelligence", "pages": (9, 12)},
+    {"source_doc": "Complete Handbook", "scope_label": "general", "section": "Faculty Staff", "subsection": "Department of Software Engineering", "pages": (13, 16)},
+    {"source_doc": "Complete Handbook", "scope_label": "general", "section": "Faculty Staff", "subsection": "Department of Information Systems", "pages": (17, 20)},
+    {"source_doc": "Complete Handbook", "scope_label": "postgraduate", "section": "Postgraduate General Information", "subsection": "Legislation and Prescribed Rules", "pages": (126, 126)},
+    {"source_doc": "Complete Handbook", "scope_label": "postgraduate", "section": "Postgraduate General Information", "subsection": "Marking Scheme and Grade Point Average (GPA)", "pages": (127, 127)},
+    {"source_doc": "Complete Handbook", "scope_label": "postgraduate", "section": "Research Guidance", "subsection": "Progress Report", "pages": (129, 129)},
+    {"source_doc": "Complete Handbook", "scope_label": "postgraduate", "section": "Research Guidance", "subsection": "Supervision Policy for Postgraduate Programmes", "pages": (130, 137)},
+    {"source_doc": "Complete Handbook", "scope_label": "postgraduate", "section": "Research Guidance", "subsection": "Thesis Preparation Guidelines", "pages": (138, 171)},
+    {"source_doc": "Complete Handbook", "scope_label": "postgraduate", "section": "Research Guidance", "subsection": "Thesis or Dissertation Submission and Examinations", "pages": (172, 172)},
+    {"source_doc": "Complete Handbook", "scope_label": "postgraduate", "section": "Research Guidance", "subsection": "Publication Requirement", "pages": (173, 175)},
+    {"source_doc": "Complete Handbook", "scope_label": "postgraduate", "section": "Research Guidance", "subsection": "Plagiarism", "pages": (176, 176)},
+    {"source_doc": "Complete Handbook", "scope_label": "postgraduate", "section": "Research Guidance", "subsection": "Intellectual Property", "pages": (177, 177)},
+    {"source_doc": "Complete Handbook", "scope_label": "postgraduate", "section": "Research Guidance", "subsection": "Postgraduate Activities", "pages": (178, 181)},
+    {"source_doc": "Complete Handbook", "scope_label": "general", "section": "Laboratory Regulations and Support", "subsection": "Laboratory Regulations", "pages": (183, 183)},
+    {"source_doc": "Complete Handbook", "scope_label": "general", "section": "Laboratory Regulations and Support", "subsection": "Technical Problem Enquiries", "pages": (184, 184)},
+    {"source_doc": "Complete Handbook", "scope_label": "undergraduate", "section": "Undergraduate Faculty Identity", "subsection": "Vision and Mission", "pages": (187, 187)},
+    {"source_doc": "Complete Handbook", "scope_label": "general", "section": "Faculty Staff", "subsection": "Undergraduate Dean's Office and Department Leadership", "pages": (192, 199)},
+    {"source_doc": "Complete Handbook", "scope_label": "undergraduate", "section": "Undergraduate Programmes", "subsection": "Programmes Offered", "pages": (200, 200)},
+    {"source_doc": "Complete Handbook", "scope_label": "undergraduate", "section": "Shared Undergraduate Curriculum", "subsection": "University Courses", "pages": (225, 227)},
+    {"source_doc": "Complete Handbook", "scope_label": "undergraduate", "section": "Shared Undergraduate Curriculum", "subsection": "Faculty Core Courses", "pages": (228, 230)},
+    {"source_doc": "Complete Handbook", "scope_label": "undergraduate", "section": "Shared Undergraduate Curriculum", "subsection": "Programme Core Courses", "pages": (231, 239)},
+    {"source_doc": "Complete Handbook", "scope_label": "undergraduate", "section": "Shared Undergraduate Curriculum", "subsection": "Specialization Elective Courses - Computer System and Network", "pages": (240, 244)},
+    {"source_doc": "Complete Handbook", "scope_label": "undergraduate", "section": "Shared Undergraduate Curriculum", "subsection": "Specialization Elective Courses - Artificial Intelligence", "pages": (245, 249)},
+    {"source_doc": "Complete Handbook", "scope_label": "undergraduate", "section": "Shared Undergraduate Curriculum", "subsection": "Specialization Elective Courses - Information Systems", "pages": (250, 254)},
+    {"source_doc": "Complete Handbook", "scope_label": "undergraduate", "section": "Shared Undergraduate Curriculum", "subsection": "Specialization Elective Courses - Software Engineering", "pages": (255, 259)},
+    {"source_doc": "Complete Handbook", "scope_label": "undergraduate", "section": "Shared Undergraduate Curriculum", "subsection": "Specialization Elective Courses - Multimedia Computing", "pages": (260, 264)},
+    {"source_doc": "Complete Handbook", "scope_label": "undergraduate", "section": "Shared Undergraduate Curriculum", "subsection": "Specialization Elective Courses - Data Science", "pages": (265, 268)},
+    {"source_doc": "Complete Handbook", "scope_label": "undergraduate", "section": "Industrial Training", "subsection": "Industrial Training Guidelines", "pages": (270, 280)},
+    {"source_doc": "Complete Handbook", "scope_label": "undergraduate", "section": "Academic Project", "subsection": "Academic Project I and II Guidelines", "pages": (282, 289)},
+    {"source_doc": "Complete Handbook", "scope_label": "undergraduate", "section": "Language Path and English Communication", "subsection": "Language Path Course / English Communication Programme 2025/2026", "pages": (292, 296)},
+    {"source_doc": "Complete Handbook", "scope_label": "general", "section": "Student Dress Code", "subsection": "Dress Code and Appearance Guides for Universiti Malaya Students", "pages": (297, 298), "manual_text": DRESS_CODE_MANUAL_TEXT},
+    {"source_doc": "Complete Handbook", "scope_label": "undergraduate", "section": "Undergraduate Rules and Regulations", "subsection": "Examination Honesty and Discipline / Undergraduate Rules", "pages": (299, 300)},
+    {"source_doc": "Complete Handbook", "scope_label": "undergraduate", "section": "Examination Grading Scheme", "subsection": "Official University Grades", "pages": (301, 301)},
+]
+for code, name, pages in UG_PROGRAMMES:
+    COMPLETE_BLOCKS.append({
+        "source_doc": "Complete Handbook",
+        "scope_label": "undergraduate",
+        "section": "Undergraduate Programme Goals and Learning Outcomes",
+        "subsection": name,
+        "pages": pages,
+    })
+for programme in PG_PROGRAMMES:
+    for subsection, pages in programme["blocks"]:
+        COMPLETE_BLOCKS.append({
+            "source_doc": "Complete Handbook",
+            "scope_label": "postgraduate",
+            "section": programme["name"],
+            "subsection": subsection,
+            "pages": pages,
+        })