TensorCat committed on
Commit
052d67e
·
verified ·
1 Parent(s): 5bfa0f8

Upload 30 files

Browse files
Files changed (31) hide show
  1. .gitattributes +3 -0
  2. UM_Handbook/(Demo Pilot)FineTune_QWEN3_UM_Handbook_en.ipynb +1531 -0
  3. UM_Handbook/Dataset/Manual_Index/UM_Manual_Core_Question_Index.json +0 -0
  4. UM_Handbook/Dataset/SFT Dataset Core Questions Draft/SFT_QA_Draft_Build_Report.json +30 -0
  5. UM_Handbook/Dataset/SFT Dataset Core Questions Draft/SFT_QA_Metadata_Draft.jsonl +0 -0
  6. UM_Handbook/Dataset/SFT Dataset Core Questions Draft/SFT_QA_Metadata_Draft_pretty.json +0 -0
  7. UM_Handbook/Dataset/SFT Dataset Core Questions Draft/SFT_QA_Training_Draft.jsonl +0 -0
  8. UM_Handbook/Dataset/SFT Dataset Core Questions Draft/SFT_QA_Training_Draft_pretty.json +0 -0
  9. UM_Handbook/Dataset/SFT_Dataset/SFT_QA_Metadata.jsonl +0 -0
  10. UM_Handbook/Dataset/SFT_Dataset/SFT_QA_Metadata_pretty.json +0 -0
  11. UM_Handbook/Dataset/SFT_Dataset/SFT_QA_Training_Ready.jsonl +0 -0
  12. UM_Handbook/Dataset/SFT_Dataset/SFT_QA_Training_Ready_pretty.json +0 -0
  13. UM_Handbook/Dataset/Source Chunk Dataset/Source_Chunks_Dataset.jsonl +0 -0
  14. UM_Handbook/Dataset/Source Chunk Dataset/Source_Chunks_Dataset_pretty.json +0 -0
  15. UM_Handbook/Dataset/Source Chunk Dataset/Source_Chunks_Dataset_report.json +14 -0
  16. UM_Handbook/Dataset/markdown/complete_handbook_structured.md +0 -0
  17. UM_Handbook/Dataset/markdown/general_handbook_structured.md +488 -0
  18. UM_Handbook/Dataset/pdf/Complete Handbook.pdf +3 -0
  19. UM_Handbook/Dataset/pdf/General Handbook.pdf +3 -0
  20. UM_Handbook/Dataset/reports/um_handbook_markdown_report.json +2771 -0
  21. UM_Handbook/FineTune_QWEN3_UM_Handbook_optimized_1.ipynb +0 -0
  22. UM_Handbook/UM_Handbook_Markdown_Preprocess.py +286 -0
  23. UM_Handbook/UM_SFT_QA_Dataset_Builder_from_Index.py +620 -0
  24. UM_Handbook/UM_Source_Chunk_Dataset_Builder.py +265 -0
  25. UM_Handbook/assets/TensorCat.png +0 -0
  26. UM_Handbook/outputs/qwen3_um_handbook_optimized_1/merged_model/chat_template.jinja +89 -0
  27. UM_Handbook/outputs/qwen3_um_handbook_optimized_1/merged_model/config.json +71 -0
  28. UM_Handbook/outputs/qwen3_um_handbook_optimized_1/merged_model/generation_config.json +13 -0
  29. UM_Handbook/outputs/qwen3_um_handbook_optimized_1/merged_model/tokenizer.json +3 -0
  30. UM_Handbook/outputs/qwen3_um_handbook_optimized_1/merged_model/tokenizer_config.json +29 -0
  31. UM_Handbook/um_handbook_config.py +230 -0
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ UM_Handbook/Dataset/pdf/Complete[[:space:]]Handbook.pdf filter=lfs diff=lfs merge=lfs -text
37
+ UM_Handbook/Dataset/pdf/General[[:space:]]Handbook.pdf filter=lfs diff=lfs merge=lfs -text
38
+ UM_Handbook/outputs/qwen3_um_handbook_optimized_1/merged_model/tokenizer.json filter=lfs diff=lfs merge=lfs -text
UM_Handbook/(Demo Pilot)FineTune_QWEN3_UM_Handbook_en.ipynb ADDED
@@ -0,0 +1,1531 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "ac09de66",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Qwen3-8B UM Handbook LoRA / QLoRA Fine-tuning\n",
9
+ "\n",
10
+ "This notebook keeps the training logic from `finetune_qwen3_um_handbook_v3.py` and organizes it into separate notebook sections for DICC.\n",
11
+ "\n",
12
+ "## Workflow\n",
13
+ "1. Check the environment and available devices\n",
14
+ "2. Read and validate `SFT_QA_Training_Ready.jsonl`\n",
15
+ "3. Convert the QA data into prompt-completion format\n",
16
+ "4. Split the data into training and validation sets\n",
17
+ "5. Download the Qwen3-8B base model into a local directory\n",
18
+ "6. Select the backend automatically: **CUDA > MPS > CPU**\n",
19
+ "7. Use **4-bit QLoRA** on CUDA and standard LoRA on MPS / CPU\n",
20
+ "8. Train with `TRL SFTTrainer`\n",
21
+ "9. Evaluate with both loss-based and generation-based metrics\n",
22
+ "10. Save the LoRA adapter, merged model, `.pt` file, metrics, and predictions\n",
23
+ "\n"
24
+ ]
25
+ },
26
+ {
27
+ "cell_type": "markdown",
28
+ "id": "71f10012",
29
+ "metadata": {},
30
+ "source": [
31
+ "## Part 1 - Install dependencies\n",
32
+ "\n",
33
+ "Run this cell only if the current DICC kernel does not already have the required packages.\n",
34
+ "\n"
35
+ ]
36
+ },
37
+ {
38
+ "cell_type": "code",
39
+ "execution_count": null,
40
+ "id": "d091473c",
41
+ "metadata": {},
42
+ "outputs": [],
43
+ "source": [
44
+ "# %pip install -U torch transformers accelerate datasets trl peft bitsandbytes sentencepiece evaluate rouge_score bert_score sacrebleu huggingface_hub"
45
+ ]
46
+ },
47
+ {
48
+ "cell_type": "markdown",
49
+ "id": "f44d18ff",
50
+ "metadata": {},
51
+ "source": [
52
+ "## Part 2 - Import libraries\n",
53
+ "\n",
54
+ "This section imports the libraries used in the training pipeline.\n",
55
+ "\n"
56
+ ]
57
+ },
58
+ {
59
+ "cell_type": "code",
60
+ "execution_count": 26,
61
+ "id": "47a8b3f7",
62
+ "metadata": {},
63
+ "outputs": [],
64
+ "source": [
65
+ "from __future__ import annotations\n",
66
+ "\n",
67
+ "import gc\n",
68
+ "import json\n",
69
+ "import math\n",
70
+ "import random\n",
71
+ "import re\n",
72
+ "import time\n",
73
+ "from pathlib import Path\n",
74
+ "from typing import Dict, List\n",
75
+ "from pathlib import Path\n",
76
+ "\n",
77
+ "import numpy as np\n",
78
+ "import torch\n",
79
+ "from datasets import Dataset, DatasetDict\n",
80
+ "from huggingface_hub import snapshot_download\n",
81
+ "from peft import LoraConfig, PeftModel\n",
82
+ "import transformers\n",
83
+ "from transformers import (\n",
84
+ " AutoModelForCausalLM,\n",
85
+ " AutoTokenizer,\n",
86
+ " BitsAndBytesConfig,\n",
87
+ " set_seed,\n",
88
+ ")\n",
89
+ "from trl import SFTConfig, SFTTrainer"
90
+ ]
91
+ },
92
+ {
93
+ "cell_type": "markdown",
94
+ "id": "469e1466",
95
+ "metadata": {},
96
+ "source": [
97
+ "## Part 3 - Configuration\n",
98
+ "\n",
99
+ "This section defines project paths, model paths, output paths, and training hyperparameters.\n",
100
+ "\n"
101
+ ]
102
+ },
103
+ {
104
+ "cell_type": "code",
105
+ "execution_count": 13,
106
+ "id": "59d68547",
107
+ "metadata": {},
108
+ "outputs": [
109
+ {
110
+ "name": "stdout",
111
+ "output_type": "stream",
112
+ "text": [
113
+ "PROJECT_ROOT = /scr/user/kevin2002/TensorCat/NLP/UM_Handbook\n",
114
+ "DATASET_ROOT = /scr/user/kevin2002/TensorCat/NLP/UM_Handbook/Dataset/SFT_Dataset\n",
115
+ "DATASET_PATH = /scr/user/kevin2002/TensorCat/NLP/UM_Handbook/Dataset/SFT_Dataset/SFT_QA_Training_Ready.jsonl\n",
116
+ "BASE_MODEL_DIR = /scr/user/kevin2002/TensorCat/NLP/UM_Handbook/models/Qwen3-8B\n",
117
+ "OUTPUT_ROOT = /scr/user/kevin2002/TensorCat/NLP/UM_Handbook/outputs/qwen3_um_handbook\n",
118
+ "USE_4BIT = True\n",
119
+ "MAX_GRAD_NORM = 1.0\n",
120
+ "PACKING = False\n",
121
+ "GRADIENT_CHECKPOINTING = True\n",
122
+ "LOW_CPU_MEM_USAGE = True\n",
123
+ "USE_FLASH_ATTENTION_2_IF_AVAILABLE = False\n"
124
+ ]
125
+ }
126
+ ],
127
+ "source": [
128
+ "# ============================================================\n",
129
+ "# CONFIG\n",
130
+ "# ============================================================\n",
131
+ "\n",
132
+ "from pathlib import Path\n",
133
+ "\n",
134
+ "WARMUP_STEPS = 20\n",
135
+ "\n",
136
+ "# Project root\n",
137
+ "PROJECT_ROOT = Path(\"/scr/user/kevin2002/TensorCat/NLP/UM_Handbook\")\n",
138
+ "\n",
139
+ "# Dataset paths\n",
140
+ "DATASET_ROOT = PROJECT_ROOT / \"Dataset\" / \"SFT_Dataset\"\n",
141
+ "DATASET_PATH = DATASET_ROOT / \"SFT_QA_Training_Ready.jsonl\"\n",
142
+ "\n",
143
+ "# Base model selection\n",
144
+ "BASE_MODEL_NAME = \"Qwen/Qwen3-8B\"\n",
145
+ "BASE_MODEL_LOCAL_DIR = PROJECT_ROOT / \"models\" / \"Qwen3-8B\"\n",
146
+ "\n",
147
+ "# Output paths\n",
148
+ "OUTPUT_ROOT = PROJECT_ROOT / \"outputs\" / \"qwen3_um_handbook\"\n",
149
+ "ADAPTER_OUTPUT_DIR = OUTPUT_ROOT / \"lora_adapter\"\n",
150
+ "MERGED_MODEL_DIR = OUTPUT_ROOT / \"merged_model\"\n",
151
+ "FINAL_PT_PATH = OUTPUT_ROOT / \"Qwen3-8B-Instruct_UM_Handbook.pt\"\n",
152
+ "METRICS_JSON_PATH = OUTPUT_ROOT / \"final_metrics.json\"\n",
153
+ "PREDICTIONS_JSONL_PATH = OUTPUT_ROOT / \"validation_predictions.jsonl\"\n",
154
+ "TRAIN_VAL_SPLIT_JSON_PATH = OUTPUT_ROOT / \"dataset_split_summary.json\"\n",
155
+ "\n",
156
+ "# Data / prompt settings\n",
157
+ "SYSTEM_PROMPT = (\n",
158
+ " \"You are an academic assistant for the Faculty of Computer Science and \"\n",
159
+ " \"Information Technology, Universiti Malaya. Answer questions accurately \"\n",
160
+ " \"and only using handbook-consistent information. If the handbook does not support \"\n",
161
+ " \"a claim, avoid inventing details.\"\n",
162
+ ")\n",
163
+ "TRAIN_VAL_RATIO = 0.90\n",
164
+ "MAX_SEQ_LENGTH = 1024\n",
165
+ "RANDOM_SEED = 42\n",
166
+ "\n",
167
+ "# LoRA / QLoRA settings\n",
168
+ "USE_4BIT = True\n",
169
+ "LORA_R = 32\n",
170
+ "LORA_ALPHA = 64\n",
171
+ "LORA_DROPOUT = 0.05\n",
172
+ "LORA_TARGET_MODULES = [\n",
173
+ " \"q_proj\",\n",
174
+ " \"k_proj\",\n",
175
+ " \"v_proj\",\n",
176
+ " \"o_proj\",\n",
177
+ " \"gate_proj\",\n",
178
+ " \"up_proj\",\n",
179
+ " \"down_proj\",\n",
180
+ "]\n",
181
+ "\n",
182
+ "# Training settings\n",
183
+ "NUM_TRAIN_EPOCHS = 6\n",
184
+ "PER_DEVICE_TRAIN_BATCH_SIZE = 2\n",
185
+ "PER_DEVICE_EVAL_BATCH_SIZE = 2\n",
186
+ "GRADIENT_ACCUMULATION_STEPS = 8\n",
187
+ "LEARNING_RATE = 2e-4\n",
188
+ "WEIGHT_DECAY = 0.01\n",
189
+ "WARMUP_RATIO = 0.05\n",
190
+ "LOGGING_STEPS = 10\n",
191
+ "SAVE_STEPS = 50\n",
192
+ "EVAL_STEPS = 50\n",
193
+ "\n",
194
+ "MAX_GRAD_NORM = 1.0\n",
195
+ "PACKING = False\n",
196
+ "GRADIENT_CHECKPOINTING = True\n",
197
+ "LOW_CPU_MEM_USAGE = True\n",
198
+ "USE_FLASH_ATTENTION_2_IF_AVAILABLE = False\n",
199
+ "\n",
200
+ "# Save / display settings\n",
201
+ "SAVE_MERGED_MODEL = True\n",
202
+ "SAVE_TOKENIZER_WITH_MERGED = True\n",
203
+ "NUM_PRINTED_PREDICTIONS = 5\n",
204
+ "\n",
205
+ "\n",
206
+ "# Generation eval settings\n",
207
+ "MAX_NEW_TOKENS_EVAL = 192\n",
208
+ "NUM_EVAL_SAMPLES_FOR_GENERATION = None # None = use full validation set\n",
209
+ "DO_SAMPLE_EVAL = False\n",
210
+ "TEMPERATURE_EVAL = 0.7\n",
211
+ "TOP_P_EVAL = 0.9\n",
212
+ "\n",
213
+ "# Final export settings\n",
214
+ "SAVE_SINGLE_PT = True\n",
215
+ "\n",
216
+ "# Create the main directories early\n",
217
+ "for path in [PROJECT_ROOT, DATASET_ROOT, BASE_MODEL_LOCAL_DIR.parent, OUTPUT_ROOT]:\n",
218
+ " path.mkdir(parents=True, exist_ok=True)\n",
219
+ "\n",
220
+ "print(\"PROJECT_ROOT =\", PROJECT_ROOT)\n",
221
+ "print(\"DATASET_ROOT =\", DATASET_ROOT)\n",
222
+ "print(\"DATASET_PATH =\", DATASET_PATH)\n",
223
+ "print(\"BASE_MODEL_DIR =\", BASE_MODEL_LOCAL_DIR)\n",
224
+ "print(\"OUTPUT_ROOT =\", OUTPUT_ROOT)\n",
225
+ "print(\"USE_4BIT =\", USE_4BIT)\n",
226
+ "\n",
227
+ "\n",
228
+ "print(\"MAX_GRAD_NORM =\", MAX_GRAD_NORM)\n",
229
+ "print(\"PACKING =\", PACKING)\n",
230
+ "print(\"GRADIENT_CHECKPOINTING =\", GRADIENT_CHECKPOINTING)\n",
231
+ "print(\"LOW_CPU_MEM_USAGE =\", LOW_CPU_MEM_USAGE)\n",
232
+ "print(\"USE_FLASH_ATTENTION_2_IF_AVAILABLE =\", USE_FLASH_ATTENTION_2_IF_AVAILABLE)\n"
233
+ ]
234
+ },
235
+ {
236
+ "cell_type": "markdown",
237
+ "id": "e9319138",
238
+ "metadata": {},
239
+ "source": [
240
+ "## Part 4 - Helper functions\n",
241
+ "\n",
242
+ "This section defines utility functions for paths, logging, text cleanup, device selection, dtype selection, and 4-bit control.\n",
243
+ "\n"
244
+ ]
245
+ },
246
+ {
247
+ "cell_type": "code",
248
+ "execution_count": 14,
249
+ "id": "f4a0e4f4",
250
+ "metadata": {},
251
+ "outputs": [],
252
+ "source": [
253
+ "def ensure_dir(path: Path) -> None:\n",
254
+ " path.mkdir(parents=True, exist_ok=True)\n",
255
+ "\n",
256
+ "def print_banner(title: str) -> None:\n",
257
+ " print(\"\\n\" + \"=\" * 88)\n",
258
+ " print(title)\n",
259
+ " print(\"=\" * 88)\n",
260
+ "\n",
261
+ "def select_runtime_backend() -> str:\n",
262
+ " if torch.cuda.is_available():\n",
263
+ " return \"cuda\"\n",
264
+ " if hasattr(torch.backends, \"mps\") and torch.backends.mps.is_available():\n",
265
+ " return \"mps\"\n",
266
+ " return \"cpu\"\n",
267
+ "\n",
268
+ "def detect_compute_dtype(backend: str) -> torch.dtype:\n",
269
+ " if backend == \"cuda\":\n",
270
+ " if torch.cuda.is_bf16_supported():\n",
271
+ " return torch.bfloat16\n",
272
+ " return torch.float16\n",
273
+ " if backend == \"mps\":\n",
274
+ " return torch.float16\n",
275
+ " return torch.float32\n",
276
+ "\n",
277
+ "def should_use_4bit(backend: str) -> bool:\n",
278
+ " return USE_4BIT and backend == \"cuda\"\n",
279
+ "\n",
280
+ "def normalize_text(text: str) -> str:\n",
281
+ " text = text.strip()\n",
282
+ " text = re.sub(r\"\\s+\", \" \", text)\n",
283
+ " return text\n",
284
+ "\n",
285
+ "def normalize_for_exact(text: str) -> str:\n",
286
+ " text = text.lower().strip()\n",
287
+ " text = re.sub(r\"[^\\w\\s]\", \" \", text)\n",
288
+ " text = re.sub(r\"\\s+\", \" \", text)\n",
289
+ " return text\n",
290
+ "\n",
291
+ "def cleanup_memory() -> None:\n",
292
+ " gc.collect()\n",
293
+ " if torch.cuda.is_available():\n",
294
+ " torch.cuda.empty_cache()"
295
+ ]
296
+ },
297
+ {
298
+ "cell_type": "markdown",
299
+ "id": "e8d9c423",
300
+ "metadata": {},
301
+ "source": [
302
+ "## Part 5 - Device detection and display\n",
303
+ "\n",
304
+ "This section prints the available devices, selected backend, selected dtype, and whether 4-bit loading is enabled.\n",
305
+ "\n"
306
+ ]
307
+ },
308
+ {
309
+ "cell_type": "code",
310
+ "execution_count": 15,
311
+ "id": "64398701",
312
+ "metadata": {},
313
+ "outputs": [
314
+ {
315
+ "name": "stdout",
316
+ "output_type": "stream",
317
+ "text": [
318
+ "\n",
319
+ "========================================================================================\n",
320
+ "Device Detection\n",
321
+ "========================================================================================\n",
322
+ "{\n",
323
+ " \"selected_backend\": \"cuda\",\n",
324
+ " \"torch_dtype\": \"torch.bfloat16\",\n",
325
+ " \"use_4bit_qlora\": true,\n",
326
+ " \"cuda_available\": true,\n",
327
+ " \"mps_available\": false,\n",
328
+ " \"cuda_device_count\": 1,\n",
329
+ " \"cuda_device_name\": \"NVIDIA A100-SXM4-80GB\"\n",
330
+ "}\n"
331
+ ]
332
+ }
333
+ ],
334
+ "source": [
335
+ "print_banner(\"Device Detection\")\n",
336
+ "\n",
337
+ "RUNTIME_DEVICE_BACKEND = select_runtime_backend()\n",
338
+ "effective_dtype = detect_compute_dtype(RUNTIME_DEVICE_BACKEND)\n",
339
+ "effective_use_4bit = should_use_4bit(RUNTIME_DEVICE_BACKEND)\n",
340
+ "\n",
341
+ "device_info = {\n",
342
+ " \"selected_backend\": RUNTIME_DEVICE_BACKEND,\n",
343
+ " \"torch_dtype\": str(effective_dtype),\n",
344
+ " \"use_4bit_qlora\": effective_use_4bit,\n",
345
+ " \"cuda_available\": torch.cuda.is_available(),\n",
346
+ " \"mps_available\": hasattr(torch.backends, \"mps\") and torch.backends.mps.is_available(),\n",
347
+ "}\n",
348
+ "\n",
349
+ "if torch.cuda.is_available():\n",
350
+ " device_info[\"cuda_device_count\"] = torch.cuda.device_count()\n",
351
+ " try:\n",
352
+ " device_info[\"cuda_device_name\"] = torch.cuda.get_device_name(0)\n",
353
+ " except Exception:\n",
354
+ " device_info[\"cuda_device_name\"] = \"Unavailable\"\n",
355
+ "\n",
356
+ "print(json.dumps(device_info, indent=2))\n",
357
+ "\n",
358
+ "if RUNTIME_DEVICE_BACKEND != \"cuda\":\n",
359
+ " print(\n",
360
+ " \"\\n[Info] Non-CUDA backend detected. 4-bit bitsandbytes QLoRA is disabled automatically, \"\n",
361
+ " \"and the training path falls back to standard LoRA on the selected backend.\"\n",
362
+ " )"
363
+ ]
364
+ },
365
+ {
366
+ "cell_type": "markdown",
367
+ "id": "69b02751",
368
+ "metadata": {},
369
+ "source": [
370
+ "## Part 6 - Evaluation functions\n",
371
+ "\n",
372
+ "This section defines exact-match and token-level F1 scoring functions.\n",
373
+ "\n"
374
+ ]
375
+ },
376
+ {
377
+ "cell_type": "code",
378
+ "execution_count": 16,
379
+ "id": "379c9dd7",
380
+ "metadata": {},
381
+ "outputs": [],
382
+ "source": [
383
+ "def token_f1(prediction: str, reference: str) -> float:\n",
384
+ " pred_tokens = normalize_for_exact(prediction).split()\n",
385
+ " ref_tokens = normalize_for_exact(reference).split()\n",
386
+ "\n",
387
+ " if not pred_tokens and not ref_tokens:\n",
388
+ " return 1.0\n",
389
+ " if not pred_tokens or not ref_tokens:\n",
390
+ " return 0.0\n",
391
+ "\n",
392
+ " common = {}\n",
393
+ " for token in pred_tokens:\n",
394
+ " common[token] = common.get(token, 0) + 1\n",
395
+ "\n",
396
+ " overlap = 0\n",
397
+ " ref_counts = {}\n",
398
+ " for token in ref_tokens:\n",
399
+ " ref_counts[token] = ref_counts.get(token, 0) + 1\n",
400
+ "\n",
401
+ " for token, count in common.items():\n",
402
+ " if token in ref_counts:\n",
403
+ " overlap += min(count, ref_counts[token])\n",
404
+ "\n",
405
+ " if overlap == 0:\n",
406
+ " return 0.0\n",
407
+ "\n",
408
+ " precision = overlap / len(pred_tokens)\n",
409
+ " recall = overlap / len(ref_tokens)\n",
410
+ " return 2 * precision * recall / (precision + recall)\n",
411
+ "\n",
412
+ "def exact_match(prediction: str, reference: str) -> float:\n",
413
+ " return float(normalize_for_exact(prediction) == normalize_for_exact(reference))"
414
+ ]
415
+ },
416
+ {
417
+ "cell_type": "markdown",
418
+ "id": "d26e26fd",
419
+ "metadata": {},
420
+ "source": [
421
+ "## Part 7 - Dataset reading and validation functions\n",
422
+ "\n",
423
+ "This section reads the JSONL file, checks required fields, and prepares prompt-completion rows.\n",
424
+ "\n"
425
+ ]
426
+ },
427
+ {
428
+ "cell_type": "code",
429
+ "execution_count": 17,
430
+ "id": "56287c34",
431
+ "metadata": {},
432
+ "outputs": [],
433
+ "source": [
434
+ "def load_jsonl(path: Path) -> List[Dict]:\n",
435
+ " rows: List[Dict] = []\n",
436
+ " with path.open(\"r\", encoding=\"utf-8\") as f:\n",
437
+ " for line_number, line in enumerate(f, 1):\n",
438
+ " line = line.strip()\n",
439
+ " if not line:\n",
440
+ " continue\n",
441
+ " obj = json.loads(line)\n",
442
+ " required = {\"question\", \"answer\"}\n",
443
+ " missing = required - set(obj.keys())\n",
444
+ " if missing:\n",
445
+ " raise ValueError(f\"Line {line_number} is missing keys: {sorted(missing)}\")\n",
446
+ " obj[\"question\"] = normalize_text(obj[\"question\"])\n",
447
+ " obj[\"answer\"] = normalize_text(obj[\"answer\"])\n",
448
+ " rows.append(obj)\n",
449
+ "\n",
450
+ " if not rows:\n",
451
+ " raise ValueError(f\"Dataset at {path} is empty.\")\n",
452
+ " return rows\n",
453
+ "\n",
454
+ "def build_prompt_completion_rows(rows: List[Dict]) -> List[Dict]:\n",
455
+ " converted: List[Dict] = []\n",
456
+ " for row in rows:\n",
457
+ " converted.append(\n",
458
+ " {\n",
459
+ " \"qa_id\": row.get(\"qa_id\", \"\"),\n",
460
+ " \"index_id\": row.get(\"index_id\", \"\"),\n",
461
+ " \"prompt\": [\n",
462
+ " {\"role\": \"system\", \"content\": SYSTEM_PROMPT},\n",
463
+ " {\"role\": \"user\", \"content\": row[\"question\"]},\n",
464
+ " ],\n",
465
+ " \"completion\": [\n",
466
+ " {\"role\": \"assistant\", \"content\": row[\"answer\"]},\n",
467
+ " ],\n",
468
+ " \"question\": row[\"question\"],\n",
469
+ " \"answer\": row[\"answer\"],\n",
470
+ " }\n",
471
+ " )\n",
472
+ " return converted\n",
473
+ "\n",
474
+ "def split_dataset(rows: List[Dict], train_ratio: float, seed: int) -> DatasetDict:\n",
475
+ " rng = random.Random(seed)\n",
476
+ " rows = rows.copy()\n",
477
+ " rng.shuffle(rows)\n",
478
+ "\n",
479
+ " split_idx = max(1, int(len(rows) * train_ratio))\n",
480
+ " split_idx = min(split_idx, len(rows) - 1)\n",
481
+ "\n",
482
+ " train_rows = rows[:split_idx]\n",
483
+ " val_rows = rows[split_idx:]\n",
484
+ "\n",
485
+ " ds = DatasetDict(\n",
486
+ " {\n",
487
+ " \"train\": Dataset.from_list(train_rows),\n",
488
+ " \"validation\": Dataset.from_list(val_rows),\n",
489
+ " }\n",
490
+ " )\n",
491
+ " return ds"
492
+ ]
493
+ },
494
+ {
495
+ "cell_type": "markdown",
496
+ "id": "53f03f17",
497
+ "metadata": {},
498
+ "source": [
499
+ "## Part 8 - Read and inspect the dataset\n",
500
+ "\n",
501
+ "This section loads the dataset, creates the train / validation split, and prints one sample.\n",
502
+ "\n"
503
+ ]
504
+ },
505
+ {
506
+ "cell_type": "code",
507
+ "execution_count": 18,
508
+ "id": "943f8ae3",
509
+ "metadata": {},
510
+ "outputs": [
511
+ {
512
+ "name": "stdout",
513
+ "output_type": "stream",
514
+ "text": [
515
+ "\n",
516
+ "========================================================================================\n",
517
+ "Step 1 - Validate dataset\n",
518
+ "========================================================================================\n",
519
+ "{\n",
520
+ " \"dataset_path\": \"/scr/user/kevin2002/TensorCat/NLP/UM_Handbook/Dataset/SFT_Dataset/SFT_QA_Training_Ready.jsonl\",\n",
521
+ " \"total_examples\": 388,\n",
522
+ " \"train_examples\": 349,\n",
523
+ " \"validation_examples\": 39,\n",
524
+ " \"seed\": 42,\n",
525
+ " \"train_val_ratio\": 0.9\n",
526
+ "}\n",
527
+ "\\nSample example:\n",
528
+ "{\n",
529
+ " \"qa_id\": \"qa_000001\",\n",
530
+ " \"index_id\": \"UMI-0001\",\n",
531
+ " \"prompt\": [\n",
532
+ " {\n",
533
+ " \"role\": \"system\",\n",
534
+ " \"content\": \"You are an academic assistant for the Faculty of Computer Science and Information Technology, Universiti Malaya. Answer questions accurately and only using handbook-consistent information. If the handbook does not support a claim, avoid inventing details.\"\n",
535
+ " },\n",
536
+ " {\n",
537
+ " \"role\": \"user\",\n",
538
+ " \"content\": \"What are the faculty objectives?\"\n",
539
+ " }\n",
540
+ " ],\n",
541
+ " \"completion\": [\n",
542
+ " {\n",
543
+ " \"role\": \"assistant\",\n",
544
+ " \"content\": \"The faculty objectives are to sustain excellence in undergraduate and postgraduate teaching, learning, and research; contribute to national development through quality research and publications; provide innovative academic programmes that respond to societal needs; and produce quality graduates with advanced knowledge and skills in computer science and information technology.\"\n",
545
+ " }\n",
546
+ " ],\n",
547
+ " \"question\": \"What are the faculty objectives?\",\n",
548
+ " \"answer\": \"The faculty objectives are to sustain excellence in undergraduate and postgraduate teaching, learning, and research; contribute to national development through quality research and publications; provide innovative academic programmes that respond to societal needs; and produce quality graduates with advanced knowledge and skills in computer science and information technology.\"\n",
549
+ "}\n"
550
+ ]
551
+ }
552
+ ],
553
+ "source": [
554
+ "print_banner(\"Step 1 - Validate dataset\")\n",
555
+ "\n",
556
+ "if not DATASET_PATH.exists():\n",
557
+ " raise FileNotFoundError(\n",
558
+ " f\"Dataset not found: {DATASET_PATH}\\n\"\n",
559
+ " f\"Place SFT_QA_Training_Ready.jsonl in DATASET_ROOT or update DATASET_PATH.\"\n",
560
+ " )\n",
561
+ "\n",
562
+ "set_seed(RANDOM_SEED)\n",
563
+ "random.seed(RANDOM_SEED)\n",
564
+ "np.random.seed(RANDOM_SEED)\n",
565
+ "\n",
566
+ "raw_rows = load_jsonl(DATASET_PATH)\n",
567
+ "converted_rows = build_prompt_completion_rows(raw_rows)\n",
568
+ "dataset_dict = split_dataset(converted_rows, TRAIN_VAL_RATIO, RANDOM_SEED)\n",
569
+ "\n",
570
+ "split_summary = {\n",
571
+ " \"dataset_path\": str(DATASET_PATH),\n",
572
+ " \"total_examples\": len(converted_rows),\n",
573
+ " \"train_examples\": len(dataset_dict[\"train\"]),\n",
574
+ " \"validation_examples\": len(dataset_dict[\"validation\"]),\n",
575
+ " \"seed\": RANDOM_SEED,\n",
576
+ " \"train_val_ratio\": TRAIN_VAL_RATIO,\n",
577
+ "}\n",
578
+ "\n",
579
+ "print(json.dumps(split_summary, indent=2, ensure_ascii=False))\n",
580
+ "print(\"\\nSample example:\")\n",
581
+ "print(json.dumps(converted_rows[0], indent=2, ensure_ascii=False)[:1800])"
582
+ ]
583
+ },
584
+ {
585
+ "cell_type": "markdown",
586
+ "id": "1589862e",
587
+ "metadata": {},
588
+ "source": [
589
+ "## Part 9 - Save the dataset split summary\n",
590
+ "\n",
591
+ "This section writes the split summary to JSON.\n",
592
+ "\n"
593
+ ]
594
+ },
595
+ {
596
+ "cell_type": "code",
597
+ "execution_count": 19,
598
+ "id": "61b5ae46",
599
+ "metadata": {},
600
+ "outputs": [
601
+ {
602
+ "name": "stdout",
603
+ "output_type": "stream",
604
+ "text": [
605
+ "Saved split summary to: /scr/user/kevin2002/TensorCat/NLP/UM_Handbook/outputs/qwen3_um_handbook/dataset_split_summary.json\n"
606
+ ]
607
+ }
608
+ ],
609
+ "source": [
610
+ "def save_json(path: Path, obj: Dict) -> None:\n",
611
+ " ensure_dir(path.parent)\n",
612
+ " with path.open(\"w\", encoding=\"utf-8\") as f:\n",
613
+ " json.dump(obj, f, indent=2, ensure_ascii=False)\n",
614
+ "\n",
615
+ "def save_predictions_jsonl(path: Path, rows: List[Dict]) -> None:\n",
616
+ " ensure_dir(path.parent)\n",
617
+ " with path.open(\"w\", encoding=\"utf-8\") as f:\n",
618
+ " for row in rows:\n",
619
+ " f.write(json.dumps(row, ensure_ascii=False) + \"\\n\")\n"
620
+ "\n",
621
+ "ensure_dir(OUTPUT_ROOT)\n",
622
+ "save_json(TRAIN_VAL_SPLIT_JSON_PATH, split_summary)\n",
623
+ "print(f\"Saved split summary to: {TRAIN_VAL_SPLIT_JSON_PATH}\")"
624
+ ]
625
+ },
626
+ {
627
+ "cell_type": "markdown",
628
+ "id": "79e53130",
629
+ "metadata": {},
630
+ "source": [
631
+ "## Part 10 - Download the base model into a local directory\n",
632
+ "\n",
633
+ "This section reuses the local model if it already exists; otherwise it downloads the base model to the configured model directory.\n",
634
+ "\n"
635
+ ]
636
+ },
637
+ {
638
+ "cell_type": "code",
639
+ "execution_count": 20,
640
+ "id": "73f9a585",
641
+ "metadata": {},
642
+ "outputs": [
643
+ {
644
+ "name": "stdout",
645
+ "output_type": "stream",
646
+ "text": [
647
+ "Base model already exists at: /scr/user/kevin2002/TensorCat/NLP/UM_Handbook/models/Qwen3-8B\n",
648
+ "Local model path: /scr/user/kevin2002/TensorCat/NLP/UM_Handbook/models/Qwen3-8B\n"
649
+ ]
650
+ }
651
+ ],
652
+ "source": [
653
+ "def download_base_model_if_needed() -> Path:\n",
654
+ " ensure_dir(BASE_MODEL_LOCAL_DIR)\n",
655
+ "\n",
656
+ " if (BASE_MODEL_LOCAL_DIR / \"config.json\").exists():\n",
657
+ " print(f\"Base model already exists at: {BASE_MODEL_LOCAL_DIR}\")\n",
658
+ " return BASE_MODEL_LOCAL_DIR\n",
659
+ "\n",
660
+ " print_banner(\"Downloading base model snapshot\")\n",
661
+ " local_path = snapshot_download(\n",
662
+ " repo_id=BASE_MODEL_NAME,\n",
663
+ " local_dir=str(BASE_MODEL_LOCAL_DIR),\n",
664
+ " local_dir_use_symlinks=False,\n",
665
+ " resume_download=True,\n",
666
+ " )\n",
667
+ " print(f\"Downloaded base model to: {local_path}\")\n",
668
+ " return BASE_MODEL_LOCAL_DIR\n",
669
+ "\n",
670
+ "local_model_path = download_base_model_if_needed()\n",
671
+ "print(f\"Local model path: {local_model_path}\")"
672
+ ]
673
+ },
674
+ {
675
+ "cell_type": "markdown",
676
+ "id": "94be6680",
677
+ "metadata": {},
678
+ "source": [
679
+ "## Part 11 - Load the tokenizer and training model\n",
680
+ "\n",
681
+ "This section loads the tokenizer and model, enables 4-bit QLoRA on CUDA, and falls back to standard LoRA on MPS / CPU.\n",
682
+ "\n"
683
+ ]
684
+ },
685
+ {
686
+ "cell_type": "code",
687
+ "execution_count": 21,
688
+ "id": "b6771a4e",
689
+ "metadata": {},
690
+ "outputs": [
691
+ {
692
+ "name": "stdout",
693
+ "output_type": "stream",
694
+ "text": [
695
+ "\n",
696
+ "========================================================================================\n",
697
+ "Step 4 - Load tokenizer and model\n",
698
+ "========================================================================================\n"
699
+ ]
700
+ },
701
+ {
702
+ "data": {
703
+ "application/vnd.jupyter.widget-view+json": {
704
+ "model_id": "c8e4cf58c2d243ddb9fbf786788b5037",
705
+ "version_major": 2,
706
+ "version_minor": 0
707
+ },
708
+ "text/plain": [
709
+ "Loading weights: 0%| | 0/399 [00:00<?, ?it/s]"
710
+ ]
711
+ },
712
+ "metadata": {},
713
+ "output_type": "display_data"
714
+ },
715
+ {
716
+ "name": "stdout",
717
+ "output_type": "stream",
718
+ "text": [
719
+ "Tokenizer and model loaded successfully.\n",
720
+ "Model class: Qwen3ForCausalLM\n"
721
+ ]
722
+ }
723
+ ],
724
+ "source": [
725
+ "def load_tokenizer(model_path: Path):\n",
726
+ " tokenizer = AutoTokenizer.from_pretrained(str(model_path), use_fast=True)\n",
727
+ " if tokenizer.pad_token is None:\n",
728
+ " tokenizer.pad_token = tokenizer.eos_token\n",
729
+ " tokenizer.padding_side = \"right\"\n",
730
+ " return tokenizer\n",
731
+ "\n",
732
+ "def load_model_for_training(model_path: Path, backend: str):\n",
733
+ " compute_dtype = detect_compute_dtype(backend)\n",
734
+ "\n",
735
+ " quantization_config = None\n",
736
+ " if should_use_4bit(backend):\n",
737
+ " quantization_config = BitsAndBytesConfig(\n",
738
+ " load_in_4bit=True,\n",
739
+ " bnb_4bit_use_double_quant=True,\n",
740
+ " bnb_4bit_quant_type=\"nf4\",\n",
741
+ " bnb_4bit_compute_dtype=compute_dtype,\n",
742
+ " )\n",
743
+ "\n",
744
+ " model_kwargs = {\n",
745
+ " \"pretrained_model_name_or_path\": str(model_path),\n",
746
+ " \"torch_dtype\": compute_dtype,\n",
747
+ " \"low_cpu_mem_usage\": LOW_CPU_MEM_USAGE,\n",
748
+ " \"trust_remote_code\": False,\n",
749
+ " }\n",
750
+ "\n",
751
+ " if backend == \"cuda\":\n",
752
+ " model_kwargs[\"device_map\"] = \"auto\"\n",
753
+ " if quantization_config is not None:\n",
754
+ " model_kwargs[\"quantization_config\"] = quantization_config\n",
755
+ " if USE_FLASH_ATTENTION_2_IF_AVAILABLE and backend == \"cuda\":\n",
756
+ " model_kwargs[\"attn_implementation\"] = \"flash_attention_2\"\n",
757
+ "\n",
758
+ " model = AutoModelForCausalLM.from_pretrained(**model_kwargs)\n",
759
+ "\n",
760
+ " if backend in {\"mps\", \"cpu\"}:\n",
761
+ " model = model.to(backend)\n",
762
+ "\n",
763
+ " model.config.use_cache = False if GRADIENT_CHECKPOINTING else True\n",
764
+ " if GRADIENT_CHECKPOINTING:\n",
765
+ " model.gradient_checkpointing_enable()\n",
766
+ "\n",
767
+ " return model\n",
768
+ "\n",
769
+ "print_banner(\"Step 4 - Load tokenizer and model\")\n",
770
+ "tokenizer = load_tokenizer(local_model_path)\n",
771
+ "model = load_model_for_training(local_model_path, RUNTIME_DEVICE_BACKEND)\n",
772
+ "\n",
773
+ "print(\"Tokenizer and model loaded successfully.\")\n",
774
+ "print(\"Model class:\", model.__class__.__name__)"
775
+ ]
776
+ },
777
+ {
778
+ "cell_type": "markdown",
779
+ "id": "192160bb",
780
+ "metadata": {},
781
+ "source": [
782
+ "## Part 12 - Build the LoRA configuration and training arguments\n",
783
+ "\n",
784
+ "This section defines the LoRA settings and trainer arguments.\n",
785
+ "\n"
786
+ ]
787
+ },
788
+ {
789
+ "cell_type": "code",
790
+ "execution_count": 22,
791
+ "id": "38a4cc45",
792
+ "metadata": {},
793
+ "outputs": [
794
+ {
795
+ "name": "stdout",
796
+ "output_type": "stream",
797
+ "text": [
798
+ "\n",
799
+ "========================================================================================\n",
800
+ "Step 5 - Build trainer\n",
801
+ "========================================================================================\n"
802
+ ]
803
+ },
804
+ {
805
+ "data": {
806
+ "application/vnd.jupyter.widget-view+json": {
807
+ "model_id": "2c1da0d8fb454d9bb3ca83a8f871a8b5",
808
+ "version_major": 2,
809
+ "version_minor": 0
810
+ },
811
+ "text/plain": [
812
+ "Tokenizing train dataset (num_proc=1): 0%| | 0/349 [00:00<?, ? examples/s]"
813
+ ]
814
+ },
815
+ "metadata": {},
816
+ "output_type": "display_data"
817
+ },
818
+ {
819
+ "data": {
820
+ "application/vnd.jupyter.widget-view+json": {
821
+ "model_id": "e0b96c79ace94af39e6018adea9da222",
822
+ "version_major": 2,
823
+ "version_minor": 0
824
+ },
825
+ "text/plain": [
826
+ "Tokenizing eval dataset (num_proc=1): 0%| | 0/39 [00:00<?, ? examples/s]"
827
+ ]
828
+ },
829
+ "metadata": {},
830
+ "output_type": "display_data"
831
+ },
832
+ {
833
+ "name": "stdout",
834
+ "output_type": "stream",
835
+ "text": [
836
+ "Trainer built successfully.\n"
837
+ ]
838
+ }
839
+ ],
840
+ "source": [
841
+ "def build_peft_config() -> LoraConfig:\n",
842
+ " return LoraConfig(\n",
843
+ " r=LORA_R,\n",
844
+ " lora_alpha=LORA_ALPHA,\n",
845
+ " lora_dropout=LORA_DROPOUT,\n",
846
+ " bias=\"none\",\n",
847
+ " task_type=\"CAUSAL_LM\",\n",
848
+ " target_modules=LORA_TARGET_MODULES,\n",
849
+ " )\n",
850
+ "\n",
851
+ "def build_training_args(backend: str) -> SFTConfig:\n",
852
+ " bf16 = backend == \"cuda\" and torch.cuda.is_bf16_supported()\n",
853
+ " fp16 = backend in {\"cuda\", \"mps\"} and not bf16\n",
854
+ "\n",
855
+ " return SFTConfig(\n",
856
+ " output_dir=str(OUTPUT_ROOT / \"trainer_runs\"),\n",
857
+ " num_train_epochs=NUM_TRAIN_EPOCHS,\n",
858
+ " per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,\n",
859
+ " per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_SIZE,\n",
860
+ " gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,\n",
861
+ " learning_rate=LEARNING_RATE,\n",
862
+ " weight_decay=WEIGHT_DECAY,\n",
863
+ " warmup_steps=WARMUP_STEPS,\n",
864
+ " logging_steps=LOGGING_STEPS,\n",
865
+ " eval_strategy=\"steps\",\n",
866
+ " eval_steps=EVAL_STEPS,\n",
867
+ " save_strategy=\"steps\",\n",
868
+ " save_steps=SAVE_STEPS,\n",
869
+ " save_total_limit=2,\n",
870
+ " load_best_model_at_end=True,\n",
871
+ " metric_for_best_model=\"eval_loss\",\n",
872
+ " greater_is_better=False,\n",
873
+ " max_grad_norm=MAX_GRAD_NORM,\n",
874
+ " lr_scheduler_type=\"cosine\",\n",
875
+ " bf16=bf16,\n",
876
+ " fp16=fp16,\n",
877
+ " gradient_checkpointing=GRADIENT_CHECKPOINTING,\n",
878
+ " max_length=MAX_SEQ_LENGTH,\n",
879
+ " packing=PACKING,\n",
880
+ " dataset_num_proc=1,\n",
881
+ " completion_only_loss=True,\n",
882
+ " remove_unused_columns=False,\n",
883
+ " report_to=\"none\",\n",
884
+ " seed=RANDOM_SEED,\n",
885
+ " optim=\"paged_adamw_8bit\" if should_use_4bit(backend) else \"adamw_torch\",\n",
886
+ " )\n",
887
+ "\n",
888
+ "def build_trainer(model, tokenizer, dataset_dict: DatasetDict, backend: str) -> SFTTrainer:\n",
889
+ " peft_config = build_peft_config()\n",
890
+ " training_args = build_training_args(backend)\n",
891
+ "\n",
892
+ " trainer = SFTTrainer(\n",
893
+ " model=model,\n",
894
+ " processing_class=tokenizer,\n",
895
+ " args=training_args,\n",
896
+ " train_dataset=dataset_dict[\"train\"],\n",
897
+ " eval_dataset=dataset_dict[\"validation\"],\n",
898
+ " peft_config=peft_config,\n",
899
+ " )\n",
900
+ " return trainer\n",
901
+ "\n",
902
+ "print_banner(\"Step 5 - Build trainer\")\n",
903
+ "trainer = build_trainer(model, tokenizer, dataset_dict, RUNTIME_DEVICE_BACKEND)\n",
904
+ "print(\"Trainer built successfully.\")"
905
+ ]
906
+ },
907
+ {
908
+ "cell_type": "markdown",
909
+ "id": "b7892f31",
910
+ "metadata": {},
911
+ "source": [
912
+ "## Part 13 - Start training\n",
913
+ "\n",
914
+ "This section starts fine-tuning.\n",
915
+ "\n"
916
+ ]
917
+ },
918
+ {
919
+ "cell_type": "code",
920
+ "execution_count": 24,
921
+ "id": "fee6890b",
922
+ "metadata": {},
923
+ "outputs": [
924
+ {
925
+ "name": "stdout",
926
+ "output_type": "stream",
927
+ "text": [
928
+ "\n",
929
+ "========================================================================================\n",
930
+ "Step 6 - Train\n",
931
+ "========================================================================================\n"
932
+ ]
933
+ },
934
+ {
935
+ "data": {
936
+ "text/html": [
937
+ "\n",
938
+ " <div>\n",
939
+ " \n",
940
+ " <progress value='132' max='132' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
941
+ " [132/132 11:01, Epoch 6/6]\n",
942
+ " </div>\n",
943
+ " <table border=\"1\" class=\"dataframe\">\n",
944
+ " <thead>\n",
945
+ " <tr style=\"text-align: left;\">\n",
946
+ " <th>Step</th>\n",
947
+ " <th>Training Loss</th>\n",
948
+ " <th>Validation Loss</th>\n",
949
+ " </tr>\n",
950
+ " </thead>\n",
951
+ " <tbody>\n",
952
+ " <tr>\n",
953
+ " <td>50</td>\n",
954
+ " <td>0.300081</td>\n",
955
+ " <td>1.132511</td>\n",
956
+ " </tr>\n",
957
+ " <tr>\n",
958
+ " <td>100</td>\n",
959
+ " <td>0.025233</td>\n",
960
+ " <td>1.302579</td>\n",
961
+ " </tr>\n",
962
+ " </tbody>\n",
963
+ "</table><p>"
964
+ ],
965
+ "text/plain": [
966
+ "<IPython.core.display.HTML object>"
967
+ ]
968
+ },
969
+ "metadata": {},
970
+ "output_type": "display_data"
971
+ },
972
+ {
973
+ "name": "stdout",
974
+ "output_type": "stream",
975
+ "text": [
976
+ "Train metrics:\n",
977
+ "{\n",
978
+ " \"train_runtime\": 666.4624,\n",
979
+ " \"train_samples_per_second\": 3.142,\n",
980
+ " \"train_steps_per_second\": 0.198,\n",
981
+ " \"total_flos\": 1.4419846776152064e+16,\n",
982
+ " \"train_loss\": 0.26707774019715463\n",
983
+ "}\n",
984
+ "Adapter saved to: /scr/user/kevin2002/TensorCat/NLP/UM_Handbook/outputs/qwen3_um_handbook/lora_adapter\n",
985
+ "Train stage minutes: 11.13\n"
986
+ ]
987
+ }
988
+ ],
989
+ "source": [
990
+ "print_banner(\"Step 6 - Train\")\n",
991
+ "train_start_time = time.time()\n",
992
+ "\n",
993
+ "train_result = trainer.train()\n",
994
+ "trainer.save_model(str(ADAPTER_OUTPUT_DIR))\n",
995
+ "tokenizer.save_pretrained(str(ADAPTER_OUTPUT_DIR))\n",
996
+ "\n",
997
+ "train_metrics = train_result.metrics\n",
998
+ "print(\"Train metrics:\")\n",
999
+ "print(json.dumps(train_metrics, indent=2, default=str))\n",
1000
+ "\n",
1001
+ "print(f\"Adapter saved to: {ADAPTER_OUTPUT_DIR}\")\n",
1002
+ "print(f\"Train stage minutes: {(time.time() - train_start_time)/60:.2f}\")"
1003
+ ]
1004
+ },
1005
+ {
1006
+ "cell_type": "markdown",
1007
+ "id": "1a64c5b7",
1008
+ "metadata": {},
1009
+ "source": [
1010
+ "## Part 14 - Teacher-forced loss evaluation\n",
1011
+ "\n",
1012
+ "This section computes `eval_loss` and `perplexity` on the validation set.\n",
1013
+ "\n"
1014
+ ]
1015
+ },
1016
+ {
1017
+ "cell_type": "code",
1018
+ "execution_count": 27,
1019
+ "id": "7ceb96b4",
1020
+ "metadata": {},
1021
+ "outputs": [
1022
+ {
1023
+ "name": "stdout",
1024
+ "output_type": "stream",
1025
+ "text": [
1026
+ "\n",
1027
+ "========================================================================================\n",
1028
+ "Step 7 - Evaluate teacher-forced loss\n",
1029
+ "========================================================================================\n",
1030
+ "Eval loss : 1.132510781288147\n",
1031
+ "Perplexity : 3.1034387822556253\n",
1032
+ "{'eval_loss': 1.132510781288147, 'eval_runtime': 2.8459, 'eval_samples_per_second': 13.704, 'eval_steps_per_second': 7.028}\n"
1033
+ ]
1034
+ }
1035
+ ],
1036
+ "source": [
1037
+ "print_banner(\"Step 7 - Evaluate teacher-forced loss\")\n",
1038
+ "\n",
1039
+ "# Remove notebook progress callback to avoid Jupyter evaluate callback error\n",
1040
+ "trainer.remove_callback(transformers.utils.notebook.NotebookProgressCallback)\n",
1041
+ "\n",
1042
+ "eval_metrics = trainer.evaluate()\n",
1043
+ "eval_loss = float(eval_metrics.get(\"eval_loss\", float(\"nan\")))\n",
1044
+ "perplexity = float(math.exp(min(eval_loss, 20))) if math.isfinite(eval_loss) else float(\"nan\")\n",
1045
+ "\n",
1046
+ "print(\"Eval loss :\", eval_loss)\n",
1047
+ "print(\"Perplexity :\", perplexity)\n",
1048
+ "print(eval_metrics)"
1049
+ ]
1050
+ },
1051
+ {
1052
+ "cell_type": "markdown",
1053
+ "id": "0da67d71",
1054
+ "metadata": {},
1055
+ "source": [
1056
+ "## Part 15 - Generation evaluation functions\n",
1057
+ "\n",
1058
+ "This section defines generation-based evaluation functions and metric calculation.\n",
1059
+ "\n"
1060
+ ]
1061
+ },
1062
+ {
1063
+ "cell_type": "code",
1064
+ "execution_count": 33,
1065
+ "id": "eff9034b",
1066
+ "metadata": {},
1067
+ "outputs": [],
1068
+ "source": [
1069
+ "def format_eval_prompt(tokenizer, question: str) -> str:\n",
1070
+ " messages = [\n",
1071
+ " {\"role\": \"system\", \"content\": SYSTEM_PROMPT},\n",
1072
+ " {\"role\": \"user\", \"content\": question},\n",
1073
+ " ]\n",
1074
+ " return tokenizer.apply_chat_template(\n",
1075
+ " messages,\n",
1076
+ " tokenize=False,\n",
1077
+ " add_generation_prompt=True,\n",
1078
+ " )\n",
1079
+ "\n",
1080
+ "@torch.inference_mode()\n",
1081
+ "def generate_answers(model, tokenizer, questions: List[str], max_new_tokens: int) -> List[str]:\n",
1082
+ " device = next(model.parameters()).device\n",
1083
+ " prompts = [format_eval_prompt(tokenizer, q) for q in questions]\n",
1084
+ " outputs: List[str] = []\n",
1085
+ "\n",
1086
+ " for prompt in prompts:\n",
1087
+ " encoded = tokenizer(\n",
1088
+ " prompt,\n",
1089
+ " return_tensors=\"pt\",\n",
1090
+ " truncation=True,\n",
1091
+ " max_length=MAX_SEQ_LENGTH,\n",
1092
+ " )\n",
1093
+ " encoded = {k: v.to(device) for k, v in encoded.items()}\n",
1094
+ "\n",
1095
+ " generated = model.generate(\n",
1096
+ " **encoded,\n",
1097
+ " max_new_tokens=max_new_tokens,\n",
1098
+ " do_sample=False,\n",
1099
+ " temperature=None,\n",
1100
+ " top_p=None,\n",
1101
+ " repetition_penalty=1.05,\n",
1102
+ " pad_token_id=tokenizer.pad_token_id,\n",
1103
+ " eos_token_id=tokenizer.eos_token_id,\n",
1104
+ " )\n",
1105
+ "\n",
1106
+ " gen_only = generated[0][encoded[\"input_ids\"].shape[1]:]\n",
1107
+ " text = tokenizer.decode(gen_only, skip_special_tokens=True)\n",
1108
+ " outputs.append(normalize_text(text))\n",
1109
+ "\n",
1110
+ " return outputs\n",
1111
+ "\n",
1112
+ "def compute_generation_metrics(predictions: List[str], references: List[str]) -> Dict[str, float]:\n",
1113
+ " import evaluate\n",
1114
+ " import sacrebleu\n",
1115
+ "\n",
1116
+ " rouge = evaluate.load(\"rouge\")\n",
1117
+ " \n",
1118
+ "\n",
1119
+ " rouge_scores = rouge.compute(predictions=predictions, references=references)\n",
1120
+ " \n",
1121
+ "\n",
1122
+ " sacrebleu_score = sacrebleu.corpus_bleu(predictions, [references]).score\n",
1123
+ " chrf_score = sacrebleu.corpus_chrf(predictions, [references], word_order=2).score\n",
1124
+ "\n",
1125
+ " em = float(np.mean([exact_match(p, r) for p, r in zip(predictions, references)]))\n",
1126
+ " tf1 = float(np.mean([token_f1(p, r) for p, r in zip(predictions, references)]))\n",
1127
+ " avg_pred_len = float(np.mean([len(p.split()) for p in predictions])) if predictions else 0.0\n",
1128
+ " avg_ref_len = float(np.mean([len(r.split()) for r in references])) if references else 0.0\n",
1129
+ "\n",
1130
+ " metrics = {\n",
1131
+ " \"exact_match\": em,\n",
1132
+ " \"token_f1\": tf1,\n",
1133
+ " \"rouge1\": float(rouge_scores[\"rouge1\"]),\n",
1134
+ " \"rouge2\": float(rouge_scores[\"rouge2\"]),\n",
1135
+ " \"rougeL\": float(rouge_scores[\"rougeL\"]),\n",
1136
+ " \"bertscore_f1\": None,\n",
1137
+ " \"sacrebleu\": float(sacrebleu_score),\n",
1138
+ " \"chrf_pp\": float(chrf_score),\n",
1139
+ " \"avg_prediction_words\": avg_pred_len,\n",
1140
+ " \"avg_reference_words\": avg_ref_len,\n",
1141
+ " }\n",
1142
+ " return metrics"
1143
+ ]
1144
+ },
1145
+ {
1146
+ "cell_type": "markdown",
1147
+ "id": "d9298597",
1148
+ "metadata": {},
1149
+ "source": [
1150
+ "## Part 16 - Run final generation evaluation on the validation set\n",
1151
+ "\n",
1152
+ "This section generates answers on the validation set, computes metrics, saves predictions, and prints a few samples.\n",
1153
+ "\n"
1154
+ ]
1155
+ },
1156
+ {
1157
+ "cell_type": "code",
1158
+ "execution_count": 35,
1159
+ "id": "995cd0ee",
1160
+ "metadata": {},
1161
+ "outputs": [
1162
+ {
1163
+ "name": "stdout",
1164
+ "output_type": "stream",
1165
+ "text": [
1166
+ "\n",
1167
+ "========================================================================================\n",
1168
+ "Step 8 - Final generation evaluation on validation split\n",
1169
+ "========================================================================================\n",
1170
+ "Generation metrics:\n",
1171
+ "{\n",
1172
+ " \"exact_match\": 0.0,\n",
1173
+ " \"token_f1\": 0.5181110282406411,\n",
1174
+ " \"rouge1\": 0.5171361078676141,\n",
1175
+ " \"rouge2\": 0.33460021476687485,\n",
1176
+ " \"rougeL\": 0.45557456154376447,\n",
1177
+ " \"bertscore_f1\": null,\n",
1178
+ " \"sacrebleu\": 31.010919781258593,\n",
1179
+ " \"chrf_pp\": 49.920320261813664,\n",
1180
+ " \"avg_prediction_words\": 36.0,\n",
1181
+ " \"avg_reference_words\": 36.69230769230769\n",
1182
+ "}\n",
1183
+ "\n",
1184
+ "========================================================================================\n",
1185
+ "Sample validation predictions\n",
1186
+ "========================================================================================\n",
1187
+ "\n",
1188
+ "[Q] What courses are listed under programme core courses?\n",
1189
+ "[REF] The handbook presents programme core courses as a curriculum table listing the approved course codes, course titles, credit values, and semester arrangement for the programme core component.\n",
1190
+ "[PRED] <think> </think> The handbook presents programme core courses as a curriculum table listing the approved course codes, course titles, credit values, and semester arrangement for that part of the programme.\n",
1191
+ "[EM=0, TokenF1=0.8772]\n",
1192
+ "\n",
1193
+ "[Q] What responsibilities do supervisors have for guidance, feedback, and assessment in the academic project?\n",
1194
+ "[REF] Supervisors are responsible for assigning or confirming project titles in the ilmiah system, supervising and coaching students, meeting them regularly, verifying the progress logbook, evaluating reports, and entering marks in the ilmiah system. More broadly, the handbook frames supervisors as one of the key parties responsible for guidance, monitoring, feedback, and assessment in the academic project process.\n",
1195
+ "[PRED] <think> </think> Supervisors are responsible for providing regular guidance, giving feedback on progress and submissions, carrying out viva or panel-based assessment where applicable, and submitting the final report through the ilmiah system for administration to issue the mark.\n",
1196
+ "[EM=0, TokenF1=0.3542]\n",
1197
+ "\n",
1198
+ "[Q] What information is given about the faculty cafeteria?\n",
1199
+ "[REF] The cafeteria is located at the back of Block A.\n",
1200
+ "[PRED] <think> </think> The handbook states that the faculty cafeteria serves both staff and students and that it is open to all UM students.\n",
1201
+ "[EM=0, TokenF1=0.2424]\n",
1202
+ "\n",
1203
+ "[Q] What dress expectations are illustrated for official events?\n",
1204
+ "[REF] For official events, the poster illustrates formal attire, including suit-style clothing and traditional formal wear, to convey a neat and official appearance appropriate for formal university occasions.\n",
1205
+ "[PRED] <think> </think> For official events, men are expected to follow formal or semi-formal Western business attire, while women should also aim for formal or appropriate Western office or ceremonial clothing.\n",
1206
+ "[EM=0, TokenF1=0.3729]\n",
1207
+ "\n",
1208
+ "[Q] What courses are listed under specialization elective courses - artificial intelligence?\n",
1209
+ "[REF] The handbook presents the Artificial Intelligence specialization electives as a curriculum table listing the approved course codes, course titles, credit values, and semester arrangement for that specialization.\n",
1210
+ "[PRED] <think> </think> The specialization elective section is intended to show the elective pool available for that track. Students should use it as a selection list of approved course codes they can choose from, following the shown curriculum structure and any stated university or faculty rules for that programme.\n",
1211
+ "[EM=0, TokenF1=0.3467]\n"
1212
+ ]
1213
+ }
1214
+ ],
1215
+ "source": [
1216
+ "print_banner(\"Step 8 - Final generation evaluation on validation split\")\n",
1217
+ "\n",
1218
+ "final_metrics = {\n",
1219
+ " \"teacher_forced_eval\": eval_metrics,\n",
1220
+ " \"perplexity\": perplexity,\n",
1221
+ "}\n",
1222
+ "\n",
1223
+ "prediction_rows = []\n",
1224
+ "\n",
1225
+ "validation_questions = dataset_dict[\"validation\"][\"question\"]\n",
1226
+ "validation_answers = dataset_dict[\"validation\"][\"answer\"]\n",
1227
+ "\n",
1228
+ "predictions = generate_answers(\n",
1229
+ " model=trainer.model,\n",
1230
+ " tokenizer=tokenizer,\n",
1231
+ " questions=validation_questions,\n",
1232
+ " max_new_tokens=MAX_NEW_TOKENS_EVAL,\n",
1233
+ ")\n",
1234
+ "\n",
1235
+ "generation_metrics = compute_generation_metrics(predictions, validation_answers)\n",
1236
+ "final_metrics[\"generation_metrics\"] = generation_metrics\n",
1237
+ "\n",
1238
+ "for i, (question, reference, prediction) in enumerate(\n",
1239
+ " zip(validation_questions, validation_answers, predictions)\n",
1240
+ "):\n",
1241
+ " prediction_rows.append(\n",
1242
+ " {\n",
1243
+ " \"row_id\": i,\n",
1244
+ " \"question\": question,\n",
1245
+ " \"reference_answer\": reference,\n",
1246
+ " \"predicted_answer\": prediction,\n",
1247
+ " \"exact_match\": exact_match(prediction, reference),\n",
1248
+ " \"token_f1\": token_f1(prediction, reference),\n",
1249
+ " }\n",
1250
+ " )\n",
1251
+ "\n",
1252
+ "save_predictions_jsonl(PREDICTIONS_JSONL_PATH, prediction_rows)\n",
1253
+ "\n",
1254
+ "print(\"Generation metrics:\")\n",
1255
+ "print(json.dumps(generation_metrics, indent=2, ensure_ascii=False))\n",
1256
+ "\n",
1257
+ "print_banner(\"Sample validation predictions\")\n",
1258
+ "for row in prediction_rows[:NUM_PRINTED_PREDICTIONS]:\n",
1259
+ " print(f\"\\n[Q] {row['question']}\")\n",
1260
+ " print(f\"[REF] {row['reference_answer']}\")\n",
1261
+ " print(f\"[PRED] {row['predicted_answer']}\")\n",
1262
+ " print(f\"[EM={row['exact_match']:.0f}, TokenF1={row['token_f1']:.4f}]\")"
1263
+ ]
1264
+ },
1265
+ {
1266
+ "cell_type": "markdown",
1267
+ "id": "255eb7de",
1268
+ "metadata": {},
1269
+ "source": [
1270
+ "## Part 17 - Save metrics\n",
1271
+ "\n",
1272
+ "This section writes the current metrics to JSON.\n",
1273
+ "\n"
1274
+ ]
1275
+ },
1276
+ {
1277
+ "cell_type": "code",
1278
+ "execution_count": 36,
1279
+ "id": "ebd241d3",
1280
+ "metadata": {},
1281
+ "outputs": [
1282
+ {
1283
+ "name": "stdout",
1284
+ "output_type": "stream",
1285
+ "text": [
1286
+ "Metrics saved to: /scr/user/kevin2002/TensorCat/NLP/UM_Handbook/outputs/qwen3_um_handbook/final_metrics.json\n",
1287
+ "Predictions saved to: /scr/user/kevin2002/TensorCat/NLP/UM_Handbook/outputs/qwen3_um_handbook/validation_predictions.jsonl\n"
1288
+ ]
1289
+ }
1290
+ ],
1291
+ "source": [
1292
+ "save_json(METRICS_JSON_PATH, final_metrics)\n",
1293
+ "print(f\"Metrics saved to: {METRICS_JSON_PATH}\")\n",
1294
+ "print(f\"Predictions saved to: {PREDICTIONS_JSONL_PATH}\")"
1295
+ ]
1296
+ },
1297
+ {
1298
+ "cell_type": "markdown",
1299
+ "id": "429b8fea",
1300
+ "metadata": {},
1301
+ "source": [
1302
+ "## Part 18 - Merge the LoRA adapter and export the final model\n",
1303
+ "\n",
1304
+ "This section reloads the base model, merges the LoRA adapter, saves the merged model directory, and optionally exports a `.pt` file.\n",
1305
+ "\n"
1306
+ ]
1307
+ },
1308
+ {
1309
+ "cell_type": "code",
1310
+ "execution_count": 37,
1311
+ "id": "720f1089",
1312
+ "metadata": {},
1313
+ "outputs": [
1314
+ {
1315
+ "name": "stdout",
1316
+ "output_type": "stream",
1317
+ "text": [
1318
+ "\n",
1319
+ "========================================================================================\n",
1320
+ "Step 9 - Save merged model\n",
1321
+ "========================================================================================\n"
1322
+ ]
1323
+ },
1324
+ {
1325
+ "data": {
1326
+ "application/vnd.jupyter.widget-view+json": {
1327
+ "model_id": "00dc5873f54b4054853f7908bd366489",
1328
+ "version_major": 2,
1329
+ "version_minor": 0
1330
+ },
1331
+ "text/plain": [
1332
+ "Loading weights: 0%| | 0/399 [00:00<?, ?it/s]"
1333
+ ]
1334
+ },
1335
+ "metadata": {},
1336
+ "output_type": "display_data"
1337
+ },
1338
+ {
1339
+ "data": {
1340
+ "application/vnd.jupyter.widget-view+json": {
1341
+ "model_id": "5c8f48a945a84fa5b75150c6cb3939d6",
1342
+ "version_major": 2,
1343
+ "version_minor": 0
1344
+ },
1345
+ "text/plain": [
1346
+ "Writing model shards: 0%| | 0/1 [00:00<?, ?it/s]"
1347
+ ]
1348
+ },
1349
+ "metadata": {},
1350
+ "output_type": "display_data"
1351
+ },
1352
+ {
1353
+ "name": "stdout",
1354
+ "output_type": "stream",
1355
+ "text": [
1356
+ "Merged model saved to: /scr/user/kevin2002/TensorCat/NLP/UM_Handbook/outputs/qwen3_um_handbook/merged_model\n",
1357
+ "\n",
1358
+ "========================================================================================\n",
1359
+ "Saving single .pt state_dict export\n",
1360
+ "========================================================================================\n",
1361
+ "Saved .pt file to: /scr/user/kevin2002/TensorCat/NLP/UM_Handbook/outputs/qwen3_um_handbook/Qwen3-8B-Instruct_UM_Handbook.pt\n"
1362
+ ]
1363
+ }
1364
+ ],
1365
+ "source": [
1366
+ "def load_base_model_for_merge(model_path: Path, backend: str):\n",
1367
+ " compute_dtype = detect_compute_dtype(backend)\n",
1368
+ " model_kwargs = {\n",
1369
+ " \"pretrained_model_name_or_path\": str(model_path),\n",
1370
+ " \"torch_dtype\": compute_dtype,\n",
1371
+ " \"low_cpu_mem_usage\": LOW_CPU_MEM_USAGE,\n",
1372
+ " \"trust_remote_code\": False,\n",
1373
+ " }\n",
1374
+ " if backend == \"cuda\":\n",
1375
+ " model_kwargs[\"device_map\"] = \"auto\"\n",
1376
+ " model = AutoModelForCausalLM.from_pretrained(**model_kwargs)\n",
1377
+ " if backend in {\"mps\", \"cpu\"}:\n",
1378
+ " model = model.to(backend)\n",
1379
+ " return model\n",
1380
+ "\n",
1381
+ "def save_single_pt_state_dict(model, path: Path) -> None:\n",
1382
+ " print_banner(\"Saving single .pt state_dict export\")\n",
1383
+ " ensure_dir(path.parent)\n",
1384
+ "\n",
1385
+ " cpu_state_dict = {}\n",
1386
+ " for key, value in model.state_dict().items():\n",
1387
+ " cpu_state_dict[key] = value.detach().cpu()\n",
1388
+ "\n",
1389
+ " torch.save(\n",
1390
+ " {\n",
1391
+ " \"model_state_dict\": cpu_state_dict,\n",
1392
+ " \"base_model_name\": BASE_MODEL_NAME,\n",
1393
+ " \"system_prompt\": SYSTEM_PROMPT,\n",
1394
+ " \"max_seq_length\": MAX_SEQ_LENGTH,\n",
1395
+ " },\n",
1396
+ " str(path),\n",
1397
+ " )\n",
1398
+ " print(f\"Saved .pt file to: {path}\")\n",
1399
+ "\n",
1400
+ "print_banner(\"Step 9 - Save merged model\")\n",
1401
+ "cleanup_memory()\n",
1402
+ "\n",
1403
+ "if SAVE_MERGED_MODEL:\n",
1404
+ " base_model_for_merge = load_base_model_for_merge(local_model_path, RUNTIME_DEVICE_BACKEND)\n",
1405
+ " merged_model = PeftModel.from_pretrained(base_model_for_merge, str(ADAPTER_OUTPUT_DIR))\n",
1406
+ " merged_model = merged_model.merge_and_unload()\n",
1407
+ "\n",
1408
+ " ensure_dir(MERGED_MODEL_DIR)\n",
1409
+ " merged_model.save_pretrained(str(MERGED_MODEL_DIR), safe_serialization=True)\n",
1410
+ "\n",
1411
+ " if SAVE_TOKENIZER_WITH_MERGED:\n",
1412
+ " tokenizer.save_pretrained(str(MERGED_MODEL_DIR))\n",
1413
+ "\n",
1414
+ " print(f\"Merged model saved to: {MERGED_MODEL_DIR}\")\n",
1415
+ "\n",
1416
+ " if SAVE_SINGLE_PT:\n",
1417
+ " save_single_pt_state_dict(merged_model, FINAL_PT_PATH)\n",
1418
+ "\n",
1419
+ " del merged_model\n",
1420
+ " del base_model_for_merge\n",
1421
+ " cleanup_memory()"
1422
+ ]
1423
+ },
1424
+ {
1425
+ "cell_type": "markdown",
1426
+ "id": "f2973f9b",
1427
+ "metadata": {},
1428
+ "source": [
1429
+ "## Part 19 - End-of-training summary\n",
1430
+ "\n",
1431
+ "This section prints the final output paths.\n",
1432
+ "\n"
1433
+ ]
1434
+ },
1435
+ {
1436
+ "cell_type": "code",
1437
+ "execution_count": 38,
1438
+ "id": "6891902c",
1439
+ "metadata": {},
1440
+ "outputs": [
1441
+ {
1442
+ "name": "stdout",
1443
+ "output_type": "stream",
1444
+ "text": [
1445
+ "\n",
1446
+ "========================================================================================\n",
1447
+ "Done\n",
1448
+ "========================================================================================\n",
1449
+ "Selected backend: cuda\n",
1450
+ "Adapter directory: /scr/user/kevin2002/TensorCat/NLP/UM_Handbook/outputs/qwen3_um_handbook/lora_adapter\n",
1451
+ "Merged model directory: /scr/user/kevin2002/TensorCat/NLP/UM_Handbook/outputs/qwen3_um_handbook/merged_model\n",
1452
+ "Single .pt file: /scr/user/kevin2002/TensorCat/NLP/UM_Handbook/outputs/qwen3_um_handbook/Qwen3-8B-Instruct_UM_Handbook.pt\n",
1453
+ "Metrics JSON: /scr/user/kevin2002/TensorCat/NLP/UM_Handbook/outputs/qwen3_um_handbook/final_metrics.json\n",
1454
+ "Predictions JSONL: /scr/user/kevin2002/TensorCat/NLP/UM_Handbook/outputs/qwen3_um_handbook/validation_predictions.jsonl\n"
1455
+ ]
1456
+ }
1457
+ ],
1458
+ "source": [
1459
+ "total_runtime_minutes = None\n",
1460
+ "try:\n",
1461
+ "    # If the notebook is run from the beginning, this variable exists\n",
1462
+ " total_runtime_minutes = \"See notebook runtime from execution order / timestamps\"\n",
1463
+ "except Exception:\n",
1464
+ " pass\n",
1465
+ "\n",
1466
+ "final_metrics[\"completion_note\"] = \"Notebook execution completed.\"\n",
1467
+ "save_json(METRICS_JSON_PATH, final_metrics)\n",
1468
+ "\n",
1469
+ "print_banner(\"Done\")\n",
1470
+ "print(f\"Selected backend: {RUNTIME_DEVICE_BACKEND}\")\n",
1471
+ "print(f\"Adapter directory: {ADAPTER_OUTPUT_DIR}\")\n",
1472
+ "print(f\"Merged model directory: {MERGED_MODEL_DIR}\")\n",
1473
+ "print(f\"Single .pt file: {FINAL_PT_PATH if SAVE_SINGLE_PT else 'disabled'}\")\n",
1474
+ "print(f\"Metrics JSON: {METRICS_JSON_PATH}\")\n",
1475
+ "print(f\"Predictions JSONL: {PREDICTIONS_JSONL_PATH}\")"
1476
+ ]
1477
+ },
1478
+ {
1479
+ "cell_type": "markdown",
1480
+ "id": "e35a1ca8",
1481
+ "metadata": {},
1482
+ "source": [
1483
+ "## Part 20 - Result inspection\n",
1484
+ "\n",
1485
+ "Check these files after training:\n",
1486
+ "\n",
1487
+ "### 1. `final_metrics.json`\n",
1488
+ "Review the overall metrics.\n",
1489
+ "\n",
1490
+ "### 2. `validation_predictions.jsonl`\n",
1491
+ "Inspect generated answers against the reference answers.\n",
1492
+ "\n",
1493
+ "### 3. `merged_model/`\n",
1494
+ "Use this directory for standard Hugging Face loading.\n",
1495
+ "\n",
1496
+ "### 4. `Qwen3-8B-Instruct_UM_Handbook.pt`\n",
1497
+ "This is the optional single-file export.\n",
1498
+ "\n"
1499
+ ]
1500
+ },
1501
+ {
1502
+ "cell_type": "code",
1503
+ "execution_count": null,
1504
+ "id": "91778773",
1505
+ "metadata": {},
1506
+ "outputs": [],
1507
+ "source": []
1508
+ }
1509
+ ],
1510
+ "metadata": {
1511
+ "kernelspec": {
1512
+ "display_name": "Python (TensorCat Py3.10)",
1513
+ "language": "python",
1514
+ "name": "tensorcat-py310"
1515
+ },
1516
+ "language_info": {
1517
+ "codemirror_mode": {
1518
+ "name": "ipython",
1519
+ "version": 3
1520
+ },
1521
+ "file_extension": ".py",
1522
+ "mimetype": "text/x-python",
1523
+ "name": "python",
1524
+ "nbconvert_exporter": "python",
1525
+ "pygments_lexer": "ipython3",
1526
+ "version": "3.10.14"
1527
+ }
1528
+ },
1529
+ "nbformat": 4,
1530
+ "nbformat_minor": 5
1531
+ }
UM_Handbook/Dataset/Manual_Index/UM_Manual_Core_Question_Index.json ADDED
The diff for this file is too large to render. See raw diff
 
UM_Handbook/Dataset/SFT Dataset Core Questions Draft/SFT_QA_Draft_Build_Report.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "stage": "baseline_1",
3
+ "format": "question_answer_only",
4
+ "inputs": {
5
+ "index_path": "/Users/kevin/PycharmProjects/TensorCat/NLP_Group_Project/UM_Handbook/Dataset/Manual_Index/UM_Manual_Index.json",
6
+ "chunk_path": "/Users/kevin/PycharmProjects/TensorCat/NLP_Group_Project/UM_Handbook/Dataset/Source Chunk Dataset/Source_Chunks_Dataset.jsonl"
7
+ },
8
+ "outputs": {
9
+ "metadata_path": "/Users/kevin/PycharmProjects/TensorCat/NLP_Group_Project/UM_Handbook/Dataset/SFT_Dataset_Draft/SFT_QA_Metadata_Draft.jsonl",
10
+ "metadata_pretty_path": "/Users/kevin/PycharmProjects/TensorCat/NLP_Group_Project/UM_Handbook/Dataset/SFT_Dataset_Draft/SFT_QA_Metadata_Draft_pretty.json",
11
+ "training_ready_path": "/Users/kevin/PycharmProjects/TensorCat/NLP_Group_Project/UM_Handbook/Dataset/SFT_Dataset_Draft/SFT_QA_Training_Draft.jsonl",
12
+ "training_ready_pretty_path": "/Users/kevin/PycharmProjects/TensorCat/NLP_Group_Project/UM_Handbook/Dataset/SFT_Dataset_Draft/SFT_QA_Training_Draft_pretty.json"
13
+ },
14
+ "counts": {
15
+ "index_rows": 395,
16
+ "chunk_rows": 521,
17
+ "metadata_rows": 395,
18
+ "training_ready_rows": 388,
19
+ "matched_rows": 388,
20
+ "unmatched_rows": 7,
21
+ "filtered_bad_match_rows": 0
22
+ },
23
+ "notes": [
24
+ "This build is for Baseline 1 only.",
25
+ "Training-ready rows contain only question and answer fields.",
26
+ "Exact linked_index_id candidates are preferred when available.",
27
+ "Bad cover/content/heading-only answers are filtered out.",
28
+ "Vision/Mission/Objectives questions use explicit label-aware extraction when possible."
29
+ ]
30
+ }
UM_Handbook/Dataset/SFT Dataset Core Questions Draft/SFT_QA_Metadata_Draft.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
UM_Handbook/Dataset/SFT Dataset Core Questions Draft/SFT_QA_Metadata_Draft_pretty.json ADDED
The diff for this file is too large to render. See raw diff
 
UM_Handbook/Dataset/SFT Dataset Core Questions Draft/SFT_QA_Training_Draft.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
UM_Handbook/Dataset/SFT Dataset Core Questions Draft/SFT_QA_Training_Draft_pretty.json ADDED
The diff for this file is too large to render. See raw diff
 
UM_Handbook/Dataset/SFT_Dataset/SFT_QA_Metadata.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
UM_Handbook/Dataset/SFT_Dataset/SFT_QA_Metadata_pretty.json ADDED
The diff for this file is too large to render. See raw diff
 
UM_Handbook/Dataset/SFT_Dataset/SFT_QA_Training_Ready.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
UM_Handbook/Dataset/SFT_Dataset/SFT_QA_Training_Ready_pretty.json ADDED
The diff for this file is too large to render. See raw diff
 
UM_Handbook/Dataset/Source Chunk Dataset/Source_Chunks_Dataset.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
UM_Handbook/Dataset/Source Chunk Dataset/Source_Chunks_Dataset_pretty.json ADDED
The diff for this file is too large to render. See raw diff
 
UM_Handbook/Dataset/Source Chunk Dataset/Source_Chunks_Dataset_report.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_chunks": 521,
3
+ "scope_distribution": {
4
+ "general": 58,
5
+ "postgraduate": 250,
6
+ "undergraduate": 213
7
+ },
8
+ "notes": [
9
+ "Chunks are generated from the structured markdown files, not directly from raw PDF pages.",
10
+ "Low-information cover/content/divider chunks are filtered out.",
11
+ "Chunk pages are preserved from per-page markdown markers when available.",
12
+ "Linked Manual_Index ids are based on exact section/subsection matches from UM_Manual_Index.json."
13
+ ]
14
+ }
UM_Handbook/Dataset/markdown/complete_handbook_structured.md ADDED
The diff for this file is too large to render. See raw diff
 
UM_Handbook/Dataset/markdown/general_handbook_structured.md ADDED
@@ -0,0 +1,488 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # General Handbook (Structured Markdown)
2
+
3
+
4
+
5
+ ## Faculty Objectives :: Faculty Objectives
6
+
7
+ - scope_label: general
8
+ - source_doc: General Handbook
9
+ - pages: 9-9
10
+
11
+ ### Page 9
12
+ VISION
13
+ A global faculty impacting the world
14
+ MISSION
15
+ Propelling computing technology and
16
+ producing world class leaders
17
+ OBJECTIVES
18
+ To sustain an outstanding faculty dedicated to excellence in
19
+ undergraduate and postgraduate teaching, learning and research.
20
+ To contribute towards the development of the nation through the
21
+ production of quality research and publications.
22
+ To provide innovative academic programs that can respond to the
23
+ changing needs of the society.
24
+ To produce quality graduates who are equipped with advanced
25
+ knowledge and skills of computer science and information technology.
26
+
27
+
28
+ ## History of the Faculty :: History Overview
29
+
30
+ - scope_label: general
31
+ - source_doc: General Handbook
32
+ - pages: 10-11
33
+
34
+ ### Page 10
35
+ The provision of computer facilities and services at the Universiti Malaya
36
+ (UM) began soon after the Computer Centre was officially formed in 1965.
37
+ This made the university one of the pioneers in computer usage in Malaysia.
38
+ In December 1969, the Computer Centre took on an additional role of
39
+ teaching and research of computer science and information technology. The
40
+ Computer Centre Board was formed, comprising the Vice-Chancellor (as
41
+ Chairman), the Director of Computer Centre (as Secretary), and a
42
+ representative from each Faculty, Institute, Centre and the University
43
+ Senate.
44
+ In 1974, the Diploma in Computer Science programme was introduced.
45
+ From its inception in the 1974/1975 Session to the 1997/1998 Session, a
46
+ total of 300 students had been awarded the Diploma. The Master of
47
+ Computer Science (MCS) and Doctor of Philosophy (Ph.D.) programme
48
+ were two (2) higher degree programme by research approved by the Senate
49
+ and had been administered by the Computer Centre since 1985. In addition,
50
+ the Computer Centre offered a four (4) years Bachelor of Computer Science
51
+ programme. The first undergraduate enrolment for the 1990/1991 Session
52
+ was 50 students.
53
+ On April 1st, 1993, the University Senate agreed to the formation of the
54
+ Computer Centre Study Board. The Board proposed the establishment of a
55
+ faculty to be called the Faculty of Computer Science and Information
56
+ Technology (FCSIT). The existing Computer Centre was to be annulled and
57
+ replaced by a Computer Services Division which was placed under the
58
+ Chancellery.
59
+ On September 22nd, 1994, the University of Malaya Council agreed to the
60
+ formation of the Faculty of Computer Science and Information Technology
61
+ (FCSIT), and the Computer Services Division. A sum of 4.2 million was
62
+ obtained from the Ministry of Education under the Sixth Malaysian Plan to
63
+ put up a new building for the faculty, with the necessary infrastructure for
64
+ teaching, learning and research. The building was officially declared open by
65
+ the Minister of Education, Dato' Sri Najib Tun Abdul Razak on September
66
+ 26th, 1996.
67
+ HISTORY OF THE FACULTY
68
+
69
+ ### Page 11
70
+ The Bachelor of Information Technology programme started in the
71
+ 1996/1997 Session, with an initial intake of 50 students. To accommodate
72
+ an increased student population, an additional building was built which was
73
+ officially opened by Datuk Fong Chan Onn, Deputy Minister of Education on
74
+ September 21st, 1998.
75
+ Since its establishment, the Faculty of Computer Science and Information
76
+ Technology has been led by a number of distinguished persons. The
77
+ following have served as Directors/Dean:
78
+ HISTORY OF THE FACULTY
79
+ 1967 – 1973
80
+ 1973 – 1975
81
+ 1975 – 1978
82
+ 1978 – 1982
83
+ 1982 – 1990
84
+ 1990 – 1992
85
+ 1992 – 2000
86
+ 2000 – 2002
87
+ 2002 – 2004
88
+ 2004 – 2005
89
+ 2005 – 2006
90
+ 2006 – 2007
91
+ 2007 – 2009
92
+ 2009 – 2010
93
+ 2010 – 2011
94
+ 2011 – 2014
95
+ 2014 – 2017
96
+ 2017 – 2019
97
+ 2019 - 2021
98
+ 2022 –2024
99
+ 2024 - 2025
100
+ 2025 - Current
101
+ Mr. Ong Yin Fook
102
+ Professor Paul Peach
103
+ Dr. R.K. Pillay
104
+ Dr. Tan Bock Thiam
105
+ Assoc. Prof. Ir. Dr. Mashkuri Yaacob
106
+ Professor Lee Poh Aun
107
+ Professor Ir. Dr. Mashkuri Yaacob
108
+ Assoc. Prof. Dr. Siti Salwah Salim
109
+ Assoc. Prof. Dr. Zainab Awang Ngah
110
+ Professor Ir. Dr. N. Selvanathan
111
+ Assoc. Prof. Dr. Siti Salwah Salim
112
+ Professor Dato' Dr. Ir. Mashkuri Hj. Yaacob
113
+ Professor Dr. Mohd Sapiyan Baba
114
+ Professor Dr. David Ngo Chek Ling
115
+ Professor Dr. Wan Ahmad Tajuddin Wan Abdullah
116
+ Professor Dr. Siti Salwah Salim
117
+ Professor Dr. Abdullah Gani
118
+ Professor Dr. Abrizah Abdullah
119
+ Professor Datin Dr. Sameem Abdul Kareem
120
+ Professor Dr. Loo Chu Kiong
121
+ Professor Ir. Dr. Chan Chee Seng
122
+ Associate Professor Dr. Norisma Idris
123
+
124
+
125
+ ## Academic Calendar 2025/2026 :: Master and Doctorate Level Academic Calendar
126
+
127
+ - scope_label: postgraduate
128
+ - source_doc: General Handbook
129
+ - pages: 4-4
130
+
131
+ ### Page 4
132
+ Lampiran B2
133
+ ACADEMIC CALENDAR 2025/2026 ACADEMIC SESSION
134
+ (MASTER AND DOCTORATE LEVEL)
135
+ AMENDMENT
136
+ SEMESTER I
137
+ Orientation (Week of Welcome)-WOW 1 week 05.10.2025 - 12.10.2025
138
+ Lectures 6 weeks* 13.10.2025 - 23.11.2025
139
+ Mid Semester I Break 1 week 24.11.2025 - 30.11.2025
140
+ Lectures 8 weeks* 01.12.2025 - 25.01.2026
141
+ Revision Week 1 week* 26.01.2026 - 01.02.2026
142
+ Semester I Final Examination 3 weeks* 02.02.2026 - 22.02.2026
143
+ Semester I Break 2 weeks 23.02.2026 - 08.03.2026
144
+ 22 weeks
145
+ SEMESTER II
146
+ Lectures 7 weeks* 09.03.2026 - 26.04.2026
147
+ Mid Semester II Break 1 week 27.04.2026 - 03.05.2026
148
+ Lectures 7 weeks* 04.05.2026 - 21.06.2026
149
+ Revision Week 1 week* 22.06.2026 - 28.06.2026
150
+ Semester II Final Examination 3 weeks* 29.06.2026 - 19.07.2026
151
+ Semester II Break 4 weeks 20.07.2026 - 16.08.2026
152
+ 23 weeks
153
+ SPECIAL SEMESTER
154
+ Lectures 7 weeks* 27.07.2026 - 13.09.2026
155
+ Special Semester Final Examination 1 week* 14.09.2026 - 20.09.2026
156
+ Special Semester Break 1 week 21.09.2026 - 28.09.2026
157
+ 9 weeks
158
+ Notes:
159
+ (1) The Module Registration and Examination Schedule can be referred to at https://umsitsguide.um.edu.my. All
160
+ information is subject to change.
161
+ (2) The Academic Calendar has taken into account public and festive holidays and is subject to change
162
+ Deepavali 20 October 2025 (Monday)
163
+ Christmas Day 25 December 2025 (Thursday)
164
+ New Year 01 January 2026 (Thursday)
165
+ Thaipusam 01 February 2026 (Sunday)
166
+ Federal Territory Day 01 February 2026 (Sunday)
167
+ Chinese New Year 17 & 18 February 2026 (Tuesday & Wednesday)
168
+ Nuzul Al-Quran 07 March 2026 (Saturday)
169
+ Eidul Fitri 20 & 21 March 2026 (Friday & Saturday)
170
+ Labour Day 1 May 2026 (Friday)
171
+ Eidul Adha 27 May 2026 (Wednesday)
172
+ Wesak Day 31 May 2026 (Sunday)
173
+ His Majesty the King's Birthday 01 June 2026 (Monday)
174
+ Awal Muharram 16 June 2026 (Tuesday)
175
+ Prophet Muhammad's (Maulidur Rasul) 25 August 2026 (Tuesday)
176
+ National Day 31 August 2026 (Monday)
177
+ Malaysia Day 16 September 2026 (Wednesday)
178
+ UM PG Fest 2026 09-11 June 2026 (Tuesday - Thursday)
179
+ *Senate August 28, 2025
180
+
181
+
182
+ ## Academic Calendar 2025/2026 :: Bachelor Degree Level Academic Calendar
183
+
184
+ - scope_label: undergraduate
185
+ - source_doc: General Handbook
186
+ - pages: 12-12
187
+
188
+ ### Page 12
189
+ Ey
190
+ ACADEMIC CALENDAR et
191
+ SESSION 2025/2026 A
192
+ eee
193
+ ACADEMIC CALENDAR 2025/2026 ACADEMIC SESSION
194
+ (BACHELOR DEGREE LEVEL)
195
+ AMENDMENT
196
+ SEMESTER I
197
+ Orientation (Week of Welcome)-WOW 1 week 05.10.2025 - 12.10.2025
198
+ Lectures 6 weeks* 13.10.2025 - 23.11.2025
199
+ Mid Semester I Break 1 week 24.11.2025 - 30.11.2025
200
+ Lectures 8 weeks* 01.12.2025 - 25.01.2026
201
+ Revision Week 1 week* 26.01.2026 - 01.02.2026
202
+ Semester I Final Examination 3 weeks* 02.02.2026 - 22.02.2026
203
+ Semester I Break 2 weeks 23.02.2026 - 08.03.2026
204
+ 22 weeks
205
+ SEMESTER II
206
+ Lectures 7 weeks* 09.03.2026 - 26.04.2026
207
+ Mid Semester II Break 1 week 27.04.2026 - 03.05.2026
208
+ Lectures 7 weeks* 04.05.2026 - 21.06.2026
209
+ Revision Week 1 week* 22.06.2026 - 28.06.2026
210
+ Semester II Final Examination 3 weeks* 29.06.2026 - 19.07.2026
211
+ Semester II Break 4 weeks 20.07.2026 - 16.08.2026
212
+ 23 weeks
213
+ SPECIAL SEMESTER
214
+ Lectures 7 weeks* 27.07.2026 - 13.09.2026
215
+ Special Semester Final Examination 1 week* 14.09.2026 - 20.09.2026
216
+ Special Semester Break 1 week 21.09.2026 - 28.09.2026
217
+ 9 weeks
218
+ Notes:
219
+ (1) The Module Registration and Examination Schedule can be referred to at https://umsitsguide.um.edu.my. All
220
+ information is subject to change.
221
+ (2) The Academic Calendar has taken into account public and festive holidays and is subject to change
222
+ Deepavali 20 October 2025 (Monday)
223
+ Christmas Day 25 December 2025 (Thursday)
224
+ New Year 01 January 2026 (Thursday)
225
+ Thaipusam 01 February 2026 (Sunday)
226
+ Federal Territory Day 01 February 2026 (Sunday)
227
+ Chinese New Year 17 & 18 February 2026 (Tuesday & Wednesday)
228
+ Nuzul Al-Quran 07 March 2026 (Saturday)
229
+ Eidul Fitri 20 & 21 March 2026 (Friday & Saturday)
230
+ Labour Day 01 May 2026 (Friday)
231
+ Eidul Adha 27 May 2026 (Wednesday)
232
+ Wesak Day 31 May 2026 (Sunday)
233
+ His Majesty the King's Birthday 1 June 2026 (Monday)
234
+ Awal Muharram 16 June 2026 (Tuesday)
235
+ Prophet Muhammad's (Maulidur Rasul) 25 August 2026 (Tuesday)
236
+ National Day 31 August 2026 (Monday)
237
+ Malaysia Day 16 September 2026 (Wednesday)
238
+ UM UG Fest 2026 09-11 June 2026 (Tuesday - Thursday)
239
+ *Senate August 28, 2025
240
+
241
+
242
+ ## Teaching and Learning Facilities :: Teaching Labs
243
+
244
+ - scope_label: general
245
+ - source_doc: General Handbook
246
+ - pages: 13-14
247
+
248
+ ### Page 13
249
+ (A) TEACHING LABS
250
+ The Faculty of Computer Science and Information Technology provides 9 laboratories for
251
+ teaching and learning purposes. The laboratories are as follows:
252
+ BLOCK A
253
+ Micro Lab 1 (MM1)
254
+ This lab has 50 units of computer that are connected to Windows Active Directory servers and
255
+ the Internet. The operating system for these PCs is Windows 10. This lab is opened to all
256
+ FSKTM undergraduate students.
257
+ Micro Lab 2 (MM2)
258
+ This lab has 12 units of computer that are connected to Windows Active Directory servers and
259
+ the Internet. The operating system for these PCs is Windows 10. This lab is opened to all
260
+ FSKTM undergraduate students.
261
+ Postgraduate Lab (ML)
262
+ This lab has 33 units of computer. All the computers are connected to Windows Active Directory
263
+ servers and the Internet. The operating system for these PCs is Windows 10. This lab is opened
264
+ to all FSKTM postgraduate students.
265
+ CCNA LAB (CCNA)
266
+ This lab has 41 units of computer. The operating system for these workstations is Windows 10.
267
+ There are also 25 units of Cisco 1700 Series Router, 4 units Cisco 1760 Series Router and 12
268
+ units switch Cisco 2950 CATALYST Series. This lab is opened to all FSKTM students.
269
+ Robotic Teaching Lab
270
+ The Robotic Teaching Lab @ FCSIT is part of the Department of Artificial Intelligence effort to
271
+ provide conducive intelligent learning environment to students taking the 'Intelligent Robotics'
272
+ course. Equipped with six mobile robots, the lab allows space for hands-on and robotic
273
+ experiments designed to help students understand the concept of robotic intelligence and
274
+ acquire the needful skills for the course.
275
+ TEACHING AND LEARNING FACILITIES
276
+ FACULTY OF COMPUTER SCIENCE AND
277
+ INFORMATION TECHNOLOGY
278
+
279
+ ### Page 14
280
+ BLOCK B
281
+ Micro Lab 3 (MM3)
282
+ This lab has 60 units of computer that are connected to Windows Active Directory servers and
283
+ the Internet. This lab is opened to undergraduate and postgraduate students.
284
+ Micro Lab 4 (MM4)
285
+ This lab has 60 units of computer that are connected to Windows Active Directory servers and
286
+ the Internet. This lab is opened to undergraduate and postgraduate students.
287
+ Micro Lab 6 (MM6)
288
+ This lab has 45 units of computer that are connected to Windows Active Directory servers and
289
+ the Internet. This lab is opened to all FSKTM students but priority is given to multimedia courses.
290
+ Operating system – Windows 10.
291
+ Stroustrup Lab 1
292
+ This lab has 42 units of computer that are connected to the Internet. This lab is opened to
293
+ undergraduate students. Operating system – Windows 10.
294
+ (B) RESEARCH LABS
295
+ 29 research labs to support postgraduate students research activities, managed by various
296
+ departments in the faculty:
297
+ BLOCK A
298
+ Computer Technology Lab
299
+ This lab is opened to post-graduate student, priority given to students who are taking courses
300
+ related to the field Computer Technology.
301
+ TEACHING AND LEARNING FACILITIES
302
+ FACULTY OF COMPUTER SCIENCE AND
303
+ INFORMATION TECHNOLOGY
304
+
305
+
306
+ ## Teaching and Learning Facilities :: Research Labs
307
+
308
+ - scope_label: general
309
+ - source_doc: General Handbook
310
+ - pages: 14-16
311
+
312
+ ### Page 14
313
+ BLOCK B
314
+ Micro Lab 3 (MM3)
315
+ This lab has 60 units of computer that are connected to Windows Active Directory servers and
316
+ the Internet. This lab is opened to undergraduate and postgraduate students.
317
+ Micro Lab 4 (MM4)
318
+ This lab has 60 units of computer that are connected to Windows Active Directory servers and
319
+ the Internet. This lab is opened to undergraduate and postgraduate students.
320
+ Micro Lab 6 (MM6)
321
+ This lab has 45 units of computer that are connected to Windows Active Directory servers and
322
+ the Internet. This lab is opened to all FSKTM students but priority is given to multimedia courses.
323
+ Operating system – Windows 10.
324
+ Stroustrup Lab 1
325
+ This lab has 42 units of computer that are connected to the Internet. This lab is opened to
326
+ undergraduate students. Operating system – Windows 10.
327
+ (B) RESEARCH LABS
328
+ 29 research labs to support postgraduate students research activities, managed by various
329
+ departments in the faculty:
330
+ BLOCK A
331
+ Computer Technology Lab
332
+ This lab is opened to post-graduate student, priority given to students who are taking courses
333
+ related to the field Computer Technology.
334
+ TEACHING AND LEARNING FACILITIES
335
+ FACULTY OF COMPUTER SCIENCE AND
336
+ INFORMATION TECHNOLOGY
337
+
338
+ ### Page 15
339
+ TEACHING AND LEARNING FACILITIES
340
+ FACULTY OF COMPUTER SCIENCE AND
341
+ INFORMATION TECHNOLOGY
342
+ BLOCK B
343
+ Artificial Intelligence Research Lab
344
+ Qualitative reasoning, qualitative modeling, Intelligent Tutoring System, Case-based System,
345
+ Intelligent Interactive Multimedia System.
346
+ VLSI Research Lab
347
+ The study of the performance and the implementation of fast pipelined floating-point arithmetic
348
+ circuits and arithmetic algorithm, as well as on designing VLSI. Focus is given to the aspect of
349
+ VLSI circuits test.
350
+ Computer Systems and Network Research Lab
351
+ Focus on data security research through networking, ability of protocols and ATM studies.
352
+ Multimedia Research Lab
353
+ Research and development comprise:
354
+ Corporate training
355
+ Smart school education software
356
+ Distributed multimedia systems
357
+ Web-based multimedia systems
358
+ Multimedia Storage & retrieval technology
359
+ Multimedia input & output technology
360
+ Human Computer Interaction (HCI) Research Lab
361
+ This lab is used for conducting research on usability, computer-supported cooperative work
362
+ (CSCW) and task analysis. It involves task analysis hierarchy chart for user understandability test
363
+ in implementing any task.
364
+
365
+ ### Page 16
366
+ Information System Research Lab
367
+ This lab is used for conducting research on dissimilar information systems integration in
368
+ heterogeneous environment including operating system, hardware, language and the use of the
369
+ latest software industrial standard to integrate information systems.
370
+ Research and development on:
371
+ Business Oriented Systems/ Electronic Government Systems
372
+ Geographic Information Systems
373
+ Inter-organizational Information Systems
374
+ Web-based Information Systems
375
+ Smart Card Application
376
+ Stroustrup Lab 2
377
+ This lab has 18 units of computer that are connected to the Internet. This lab is opened to
378
+ undergraduate students taking courses related to electronic circuit.
379
+ (C) PROJECT BASED LAB
380
+ Artificial Intelligence 4 U (AI4U)
381
+ AI-based Machine Vision essentials. Key objective is to transfer ‘AI-based machine
382
+ vision’ knowledge to university lecturers and students.
383
+ Wisma R&D (15th floor):
384
+ Web Based Information System Lab & Knowledge Engineering Lab (Open-Space
385
+ Concept)
386
+ Both the Knowledge Engineering Lab and the Web-Based Information System Lab are open-
387
+ space concept labs accessible to all postgraduate students, regardless of their field. Students
388
+ can use the space and facilities provided in these labs, with permission for access. These labs
389
+ are located on the 15th floor of Wisma R&D
390
+ Robotedge AI Robotic Lab
391
+ This lab is previously known as Natural Language Processing Lab. This lab is equipped with
392
+ equipment for AI robotics research and development focusing on environmental, home services,
393
+ and search and rescue research areas.
394
+ TEACHING AND LEARNING FACILITIES
395
+ FACULTY OF COMPUTER SCIENCE AND
396
+ INFORMATION TECHNOLOGY
397
+
398
+
399
+ ## Teaching and Learning Facilities :: Project Based Labs
400
+
401
+ - scope_label: general
402
+ - source_doc: General Handbook
403
+ - pages: 16-16
404
+
405
+ ### Page 16
406
+ Information System Research Lab
407
+ This lab is used for conducting research on dissimilar information systems integration in
408
+ heterogeneous environment including operating system, hardware, language and the use of the
409
+ latest software industrial standard to integrate information systems.
410
+ Research and development on:
411
+ Business Oriented Systems/ Electronic Government Systems
412
+ Geographic Information Systems
413
+ Inter-organizational Information Systems
414
+ Web-based Information Systems
415
+ Smart Card Application
416
+ Stroustrup Lab 2
417
+ This lab has 18 units of computer that are connected to the Internet. This lab is opened to
418
+ undergraduate students taking courses related to electronic circuit.
419
+ (C) PROJECT BASED LAB
420
+ Artificial Intelligence 4 U (AI4U)
421
+ AI-based Machine Vision essentials. Key objective is to transfer ‘AI-based machine
422
+ vision’ knowledge to university lecturers and students.
423
+ Wisma R&D (15th floor):
424
+ Web Based Information System Lab & Knowledge Engineering Lab (Open-Space
425
+ Concept)
426
+ Both the Knowledge Engineering Lab and the Web-Based Information System Lab are open-
427
+ space concept labs accessible to all postgraduate students, regardless of their field. Students
428
+ can use the space and facilities provided in these labs, with permission for access. These labs
429
+ are located on the 15th floor of Wisma R&D
430
+ Robotedge AI Robotic Lab
431
+ This lab is previously known as Natural Language Processing Lab. This lab is equipped with
432
+ equipment for AI robotics research and development focusing on environmental, home services,
433
+ and search and rescue research areas.
434
+ TEACHING AND LEARNING FACILITIES
435
+ FACULTY OF COMPUTER SCIENCE AND
436
+ INFORMATION TECHNOLOGY
437
+
438
+
439
+ ## Other Facilities :: Student Support and Campus Facilities
440
+
441
+ - scope_label: general
442
+ - source_doc: General Handbook
443
+ - pages: 17-17
444
+
445
+ ### Page 17
446
+ Prayer Room (surau)
447
+ Air-conditioned prayer rooms (surau) (one for Men, and the other for Women) are provided in Block A for
448
+ Muslims to pray. The surau for Men is located at the second floor and surau for women is located at the first
449
+ floor in the building. Users are not allowed to sleep and eat in the surau. Users are also responsible for the
450
+ cleanliness of the surau.
451
+ Vending Machine (Drinks)
452
+ There are 4 units of vending machine for cold drinks located at Block A and Block B.
453
+ Cafeteria
454
+ Cafeteria is located at the back of Block A.
455
+ Postgraduate Lounge & Student Centre
456
+ Space is provided for students to relax, hold informal discussions and have small gatherings. A few
457
+ facilities such as sofas, computers, discussion rooms and pantry are ready to use.
458
+ Parking Lot
459
+ The Faculty also provides parking lots for students to park their car or motorbike. Students can park their car or
460
+ motorbike at the back of Block A. There are 150 parking lots for the motorbike and 45 for the car. Students are
461
+ not allowed to park their car in front of both buildings because the parking lots are reserved for the faculty staff
462
+ and visitors.
463
+ Water Purifiers
464
+ Water purifiers are provided in both buildings and placed at every floor.
465
+ Internet Access at the building of FCSIT
466
+ WIFI Internet access is provided to students on every floor of each building. Students must obey the
467
+ rules and regulations during the usage of these facilities.
468
+ SPeCTRUM (Student Powered e-Collaboration Transforming UM)
469
+ This facility is for easy accessibility for student to upload their notes and information regarding their courses.
470
+ All faculties (excluding Faculty of Medicine & Faculty of Dentistry) and PASUM can browse the SPECTRUM
471
+ website at https://spectrum.um.edu.my/
472
+ For Faculty of Medicine and Faculty of Dentistry, SPECTRUM website can be browsed at
473
+ https://spectrumx.um.edu.my/
474
+ All queries and suggestions can be directed to https://helpdesk.um.edu.my/
475
+ Door Access
476
+ Students must register for door access for using research labs, Student Center and Postgraduate Lounge.
477
+ OTHER FACILITIES
478
+ FACULTY OF COMPUTER SCIENCE AND
479
+ INFORMATION TECHNOLOGY
480
+ 1.
481
+ 2.
482
+ 3.
483
+ 4.
484
+ 5.
485
+ 6.
486
+ 7.
487
+ 8.
488
+ 9.
UM_Handbook/Dataset/pdf/Complete Handbook.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19204e4c67cb49051387ad558bed58a302ea332f66ae82e297ffb19347ca5455
3
+ size 35690631
UM_Handbook/Dataset/pdf/General Handbook.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c2e68a06c7da277fbe1256885f7062c4c91d3bb8bbcd341a583da45a5119b0ec
3
+ size 3286027
UM_Handbook/Dataset/reports/um_handbook_markdown_report.json ADDED
@@ -0,0 +1,2771 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "generated_files": {
3
+ "general_markdown": "/Users/kevin/PycharmProjects/TensorCat/NLP_Group_Project/UM_Handbook/Dataset/markdown/general_handbook_structured.md",
4
+ "complete_markdown": "/Users/kevin/PycharmProjects/TensorCat/NLP_Group_Project/UM_Handbook/Dataset/markdown/complete_handbook_structured.md"
5
+ },
6
+ "general_block_count": 8,
7
+ "complete_block_count": 75,
8
+ "general_blocks": [
9
+ {
10
+ "section": "Faculty Objectives",
11
+ "subsection": "Faculty Objectives",
12
+ "scope_label": "general",
13
+ "source_doc": "General Handbook",
14
+ "pages": [
15
+ 9,
16
+ 9
17
+ ],
18
+ "page_stats": [
19
+ {
20
+ "page": 9,
21
+ "source": "native",
22
+ "chars": 590,
23
+ "seconds": 0.03
24
+ }
25
+ ],
26
+ "total_chars": 601,
27
+ "seconds": 0.03
28
+ },
29
+ {
30
+ "section": "History of the Faculty",
31
+ "subsection": "History Overview",
32
+ "scope_label": "general",
33
+ "source_doc": "General Handbook",
34
+ "pages": [
35
+ 10,
36
+ 11
37
+ ],
38
+ "page_stats": [
39
+ {
40
+ "page": 10,
41
+ "source": "native",
42
+ "chars": 2073,
43
+ "seconds": 0.02
44
+ },
45
+ {
46
+ "page": 11,
47
+ "source": "native",
48
+ "chars": 1467,
49
+ "seconds": 0.01
50
+ }
51
+ ],
52
+ "total_chars": 3566,
53
+ "seconds": 0.02
54
+ },
55
+ {
56
+ "section": "Academic Calendar 2025/2026",
57
+ "subsection": "Master and Doctorate Level Academic Calendar",
58
+ "scope_label": "postgraduate",
59
+ "source_doc": "General Handbook",
60
+ "pages": [
61
+ 4,
62
+ 4
63
+ ],
64
+ "page_stats": [
65
+ {
66
+ "page": 4,
67
+ "source": "ocr",
68
+ "chars": 2004,
69
+ "seconds": 1.56
70
+ }
71
+ ],
72
+ "total_chars": 2015,
73
+ "seconds": 1.56
74
+ },
75
+ {
76
+ "section": "Academic Calendar 2025/2026",
77
+ "subsection": "Bachelor Degree Level Academic Calendar",
78
+ "scope_label": "undergraduate",
79
+ "source_doc": "General Handbook",
80
+ "pages": [
81
+ 12,
82
+ 12
83
+ ],
84
+ "page_stats": [
85
+ {
86
+ "page": 12,
87
+ "source": "ocr",
88
+ "chars": 2038,
89
+ "seconds": 1.58
90
+ }
91
+ ],
92
+ "total_chars": 2050,
93
+ "seconds": 1.58
94
+ },
95
+ {
96
+ "section": "Teaching and Learning Facilities",
97
+ "subsection": "Teaching Labs",
98
+ "scope_label": "general",
99
+ "source_doc": "General Handbook",
100
+ "pages": [
101
+ 13,
102
+ 14
103
+ ],
104
+ "page_stats": [
105
+ {
106
+ "page": 13,
107
+ "source": "native",
108
+ "chars": 1693,
109
+ "seconds": 0.01
110
+ },
111
+ {
112
+ "page": 14,
113
+ "source": "native",
114
+ "chars": 1182,
115
+ "seconds": 0.01
116
+ }
117
+ ],
118
+ "total_chars": 2901,
119
+ "seconds": 0.02
120
+ },
121
+ {
122
+ "section": "Teaching and Learning Facilities",
123
+ "subsection": "Research Labs",
124
+ "scope_label": "general",
125
+ "source_doc": "General Handbook",
126
+ "pages": [
127
+ 14,
128
+ 16
129
+ ],
130
+ "page_stats": [
131
+ {
132
+ "page": 14,
133
+ "source": "native",
134
+ "chars": 1182,
135
+ "seconds": 0.0
136
+ },
137
+ {
138
+ "page": 15,
139
+ "source": "native",
140
+ "chars": 1146,
141
+ "seconds": 0.01
142
+ },
143
+ {
144
+ "page": 16,
145
+ "source": "native",
146
+ "chars": 1630,
147
+ "seconds": 0.01
148
+ }
149
+ ],
150
+ "total_chars": 3998,
151
+ "seconds": 0.02
152
+ },
153
+ {
154
+ "section": "Teaching and Learning Facilities",
155
+ "subsection": "Project Based Labs",
156
+ "scope_label": "general",
157
+ "source_doc": "General Handbook",
158
+ "pages": [
159
+ 16,
160
+ 16
161
+ ],
162
+ "page_stats": [
163
+ {
164
+ "page": 16,
165
+ "source": "native",
166
+ "chars": 1630,
167
+ "seconds": 0.0
168
+ }
169
+ ],
170
+ "total_chars": 1642,
171
+ "seconds": 0.01
172
+ },
173
+ {
174
+ "section": "Other Facilities",
175
+ "subsection": "Student Support and Campus Facilities",
176
+ "scope_label": "general",
177
+ "source_doc": "General Handbook",
178
+ "pages": [
179
+ 17,
180
+ 17
181
+ ],
182
+ "page_stats": [
183
+ {
184
+ "page": 17,
185
+ "source": "native",
186
+ "chars": 2156,
187
+ "seconds": 0.01
188
+ }
189
+ ],
190
+ "total_chars": 2168,
191
+ "seconds": 0.01
192
+ }
193
+ ],
194
+ "complete_blocks": [
195
+ {
196
+ "section": "Postgraduate Faculty Identity",
197
+ "subsection": "Vision and Mission",
198
+ "scope_label": "postgraduate",
199
+ "source_doc": "Complete Handbook",
200
+ "pages": [
201
+ 186,
202
+ 186
203
+ ],
204
+ "page_stats": [
205
+ {
206
+ "page": 186,
207
+ "source": "native",
208
+ "chars": 714,
209
+ "seconds": 0.01
210
+ }
211
+ ],
212
+ "total_chars": 727,
213
+ "seconds": 0.01
214
+ },
215
+ {
216
+ "section": "Faculty Staff",
217
+ "subsection": "Dean's Office and Management",
218
+ "scope_label": "general",
219
+ "source_doc": "Complete Handbook",
220
+ "pages": [
221
+ 6,
222
+ 8
223
+ ],
224
+ "page_stats": [
225
+ {
226
+ "page": 6,
227
+ "source": "ocr",
228
+ "chars": 521,
229
+ "seconds": 0.62
230
+ },
231
+ {
232
+ "page": 7,
233
+ "source": "ocr",
234
+ "chars": 772,
235
+ "seconds": 0.75
236
+ },
237
+ {
238
+ "page": 8,
239
+ "source": "ocr",
240
+ "chars": 443,
241
+ "seconds": 0.4
242
+ }
243
+ ],
244
+ "total_chars": 1773,
245
+ "seconds": 1.77
246
+ },
247
+ {
248
+ "section": "Faculty Staff",
249
+ "subsection": "Department of Artificial Intelligence",
250
+ "scope_label": "general",
251
+ "source_doc": "Complete Handbook",
252
+ "pages": [
253
+ 9,
254
+ 12
255
+ ],
256
+ "page_stats": [
257
+ {
258
+ "page": 9,
259
+ "source": "ocr",
260
+ "chars": 1438,
261
+ "seconds": 1.0
262
+ },
263
+ {
264
+ "page": 10,
265
+ "source": "ocr",
266
+ "chars": 1620,
267
+ "seconds": 1.3
268
+ },
269
+ {
270
+ "page": 11,
271
+ "source": "ocr",
272
+ "chars": 1392,
273
+ "seconds": 1.17
274
+ },
275
+ {
276
+ "page": 12,
277
+ "source": "ocr",
278
+ "chars": 1388,
279
+ "seconds": 1.25
280
+ }
281
+ ],
282
+ "total_chars": 5891,
283
+ "seconds": 4.72
284
+ },
285
+ {
286
+ "section": "Faculty Staff",
287
+ "subsection": "Department of Software Engineering",
288
+ "scope_label": "general",
289
+ "source_doc": "Complete Handbook",
290
+ "pages": [
291
+ 13,
292
+ 16
293
+ ],
294
+ "page_stats": [
295
+ {
296
+ "page": 13,
297
+ "source": "ocr",
298
+ "chars": 1822,
299
+ "seconds": 1.39
300
+ },
301
+ {
302
+ "page": 14,
303
+ "source": "ocr",
304
+ "chars": 1655,
305
+ "seconds": 1.36
306
+ },
307
+ {
308
+ "page": 15,
309
+ "source": "ocr",
310
+ "chars": 1857,
311
+ "seconds": 1.41
312
+ },
313
+ {
314
+ "page": 16,
315
+ "source": "ocr",
316
+ "chars": 1550,
317
+ "seconds": 1.72
318
+ }
319
+ ],
320
+ "total_chars": 6938,
321
+ "seconds": 5.88
322
+ },
323
+ {
324
+ "section": "Faculty Staff",
325
+ "subsection": "Department of Information Systems",
326
+ "scope_label": "general",
327
+ "source_doc": "Complete Handbook",
328
+ "pages": [
329
+ 17,
330
+ 20
331
+ ],
332
+ "page_stats": [
333
+ {
334
+ "page": 17,
335
+ "source": "ocr",
336
+ "chars": 1455,
337
+ "seconds": 2.99
338
+ },
339
+ {
340
+ "page": 18,
341
+ "source": "ocr",
342
+ "chars": 1576,
343
+ "seconds": 1.13
344
+ },
345
+ {
346
+ "page": 19,
347
+ "source": "ocr",
348
+ "chars": 1466,
349
+ "seconds": 0.94
350
+ },
351
+ {
352
+ "page": 20,
353
+ "source": "ocr",
354
+ "chars": 269,
355
+ "seconds": 0.46
356
+ }
357
+ ],
358
+ "total_chars": 4820,
359
+ "seconds": 5.51
360
+ },
361
+ {
362
+ "section": "Postgraduate General Information",
363
+ "subsection": "Legislation and Prescribed Rules",
364
+ "scope_label": "postgraduate",
365
+ "source_doc": "Complete Handbook",
366
+ "pages": [
367
+ 126,
368
+ 126
369
+ ],
370
+ "page_stats": [
371
+ {
372
+ "page": 126,
373
+ "source": "ocr",
374
+ "chars": 979,
375
+ "seconds": 0.8
376
+ }
377
+ ],
378
+ "total_chars": 992,
379
+ "seconds": 0.8
380
+ },
381
+ {
382
+ "section": "Postgraduate General Information",
383
+ "subsection": "Marking Scheme and Grade Point Average (GPA)",
384
+ "scope_label": "postgraduate",
385
+ "source_doc": "Complete Handbook",
386
+ "pages": [
387
+ 127,
388
+ 127
389
+ ],
390
+ "page_stats": [
391
+ {
392
+ "page": 127,
393
+ "source": "ocr",
394
+ "chars": 446,
395
+ "seconds": 0.74
396
+ }
397
+ ],
398
+ "total_chars": 459,
399
+ "seconds": 0.74
400
+ },
401
+ {
402
+ "section": "Research Guidance",
403
+ "subsection": "Progress Report",
404
+ "scope_label": "postgraduate",
405
+ "source_doc": "Complete Handbook",
406
+ "pages": [
407
+ 129,
408
+ 129
409
+ ],
410
+ "page_stats": [
411
+ {
412
+ "page": 129,
413
+ "source": "ocr",
414
+ "chars": 640,
415
+ "seconds": 0.58
416
+ }
417
+ ],
418
+ "total_chars": 653,
419
+ "seconds": 0.58
420
+ },
421
+ {
422
+ "section": "Research Guidance",
423
+ "subsection": "Supervision Policy for Postgraduate Programmes",
424
+ "scope_label": "postgraduate",
425
+ "source_doc": "Complete Handbook",
426
+ "pages": [
427
+ 130,
428
+ 137
429
+ ],
430
+ "page_stats": [
431
+ {
432
+ "page": 130,
433
+ "source": "ocr",
434
+ "chars": 1555,
435
+ "seconds": 1.4
436
+ },
437
+ {
438
+ "page": 131,
439
+ "source": "ocr",
440
+ "chars": 2554,
441
+ "seconds": 1.85
442
+ },
443
+ {
444
+ "page": 132,
445
+ "source": "ocr",
446
+ "chars": 1256,
447
+ "seconds": 1.05
448
+ },
449
+ {
450
+ "page": 133,
451
+ "source": "ocr",
452
+ "chars": 347,
453
+ "seconds": 0.49
454
+ },
455
+ {
456
+ "page": 134,
457
+ "source": "ocr",
458
+ "chars": 2299,
459
+ "seconds": 1.63
460
+ },
461
+ {
462
+ "page": 135,
463
+ "source": "ocr",
464
+ "chars": 2336,
465
+ "seconds": 1.78
466
+ },
467
+ {
468
+ "page": 136,
469
+ "source": "ocr",
470
+ "chars": 1592,
471
+ "seconds": 1.22
472
+ },
473
+ {
474
+ "page": 137,
475
+ "source": "ocr",
476
+ "chars": 587,
477
+ "seconds": 0.85
478
+ }
479
+ ],
480
+ "total_chars": 12644,
481
+ "seconds": 10.26
482
+ },
483
+ {
484
+ "section": "Research Guidance",
485
+ "subsection": "Thesis Preparation Guidelines",
486
+ "scope_label": "postgraduate",
487
+ "source_doc": "Complete Handbook",
488
+ "pages": [
489
+ 138,
490
+ 171
491
+ ],
492
+ "page_stats": [
493
+ {
494
+ "page": 138,
495
+ "source": "ocr",
496
+ "chars": 1525,
497
+ "seconds": 1.2
498
+ },
499
+ {
500
+ "page": 139,
501
+ "source": "ocr",
502
+ "chars": 1389,
503
+ "seconds": 1.03
504
+ },
505
+ {
506
+ "page": 140,
507
+ "source": "ocr",
508
+ "chars": 2853,
509
+ "seconds": 1.68
510
+ },
511
+ {
512
+ "page": 141,
513
+ "source": "ocr",
514
+ "chars": 778,
515
+ "seconds": 0.71
516
+ },
517
+ {
518
+ "page": 142,
519
+ "source": "ocr",
520
+ "chars": 2585,
521
+ "seconds": 1.66
522
+ },
523
+ {
524
+ "page": 143,
525
+ "source": "ocr",
526
+ "chars": 2168,
527
+ "seconds": 2.02
528
+ },
529
+ {
530
+ "page": 144,
531
+ "source": "ocr",
532
+ "chars": 1149,
533
+ "seconds": 1.0
534
+ },
535
+ {
536
+ "page": 145,
537
+ "source": "ocr",
538
+ "chars": 705,
539
+ "seconds": 0.85
540
+ },
541
+ {
542
+ "page": 146,
543
+ "source": "ocr",
544
+ "chars": 1735,
545
+ "seconds": 1.26
546
+ },
547
+ {
548
+ "page": 147,
549
+ "source": "ocr",
550
+ "chars": 1045,
551
+ "seconds": 0.8
552
+ },
553
+ {
554
+ "page": 148,
555
+ "source": "ocr",
556
+ "chars": 750,
557
+ "seconds": 0.73
558
+ },
559
+ {
560
+ "page": 149,
561
+ "source": "ocr",
562
+ "chars": 1195,
563
+ "seconds": 1.39
564
+ },
565
+ {
566
+ "page": 150,
567
+ "source": "ocr",
568
+ "chars": 95,
569
+ "seconds": 0.47
570
+ },
571
+ {
572
+ "page": 151,
573
+ "source": "ocr",
574
+ "chars": 1446,
575
+ "seconds": 1.0
576
+ },
577
+ {
578
+ "page": 152,
579
+ "source": "ocr",
580
+ "chars": 2698,
581
+ "seconds": 1.78
582
+ },
583
+ {
584
+ "page": 153,
585
+ "source": "ocr",
586
+ "chars": 1588,
587
+ "seconds": 1.54
588
+ },
589
+ {
590
+ "page": 154,
591
+ "source": "ocr",
592
+ "chars": 2154,
593
+ "seconds": 1.46
594
+ },
595
+ {
596
+ "page": 155,
597
+ "source": "ocr",
598
+ "chars": 2084,
599
+ "seconds": 1.37
600
+ },
601
+ {
602
+ "page": 156,
603
+ "source": "ocr",
604
+ "chars": 1279,
605
+ "seconds": 0.94
606
+ },
607
+ {
608
+ "page": 157,
609
+ "source": "ocr",
610
+ "chars": 1991,
611
+ "seconds": 1.37
612
+ },
613
+ {
614
+ "page": 158,
615
+ "source": "ocr",
616
+ "chars": 2460,
617
+ "seconds": 1.64
618
+ },
619
+ {
620
+ "page": 159,
621
+ "source": "ocr",
622
+ "chars": 1064,
623
+ "seconds": 1.05
624
+ },
625
+ {
626
+ "page": 160,
627
+ "source": "ocr",
628
+ "chars": 1916,
629
+ "seconds": 1.25
630
+ },
631
+ {
632
+ "page": 161,
633
+ "source": "ocr",
634
+ "chars": 1922,
635
+ "seconds": 1.28
636
+ },
637
+ {
638
+ "page": 162,
639
+ "source": "ocr",
640
+ "chars": 1451,
641
+ "seconds": 1.06
642
+ },
643
+ {
644
+ "page": 163,
645
+ "source": "ocr",
646
+ "chars": 984,
647
+ "seconds": 0.74
648
+ },
649
+ {
650
+ "page": 164,
651
+ "source": "ocr",
652
+ "chars": 189,
653
+ "seconds": 0.47
654
+ },
655
+ {
656
+ "page": 165,
657
+ "source": "ocr",
658
+ "chars": 519,
659
+ "seconds": 0.82
660
+ },
661
+ {
662
+ "page": 166,
663
+ "source": "ocr",
664
+ "chars": 343,
665
+ "seconds": 1.04
666
+ },
667
+ {
668
+ "page": 167,
669
+ "source": "ocr",
670
+ "chars": 1223,
671
+ "seconds": 1.89
672
+ },
673
+ {
674
+ "page": 168,
675
+ "source": "ocr",
676
+ "chars": 1905,
677
+ "seconds": 1.21
678
+ },
679
+ {
680
+ "page": 169,
681
+ "source": "ocr",
682
+ "chars": 2290,
683
+ "seconds": 2.18
684
+ },
685
+ {
686
+ "page": 170,
687
+ "source": "ocr",
688
+ "chars": 477,
689
+ "seconds": 0.47
690
+ },
691
+ {
692
+ "page": 171,
693
+ "source": "ocr",
694
+ "chars": 1465,
695
+ "seconds": 1.03
696
+ }
697
+ ],
698
+ "total_chars": 49928,
699
+ "seconds": 40.36
700
+ },
701
+ {
702
+ "section": "Research Guidance",
703
+ "subsection": "Thesis or Dissertation Submission and Examinations",
704
+ "scope_label": "postgraduate",
705
+ "source_doc": "Complete Handbook",
706
+ "pages": [
707
+ 172,
708
+ 172
709
+ ],
710
+ "page_stats": [
711
+ {
712
+ "page": 172,
713
+ "source": "native_filtered",
714
+ "chars": 0,
715
+ "seconds": 0.33
716
+ }
717
+ ],
718
+ "total_chars": 0,
719
+ "seconds": 0.33
720
+ },
721
+ {
722
+ "section": "Research Guidance",
723
+ "subsection": "Publication Requirement",
724
+ "scope_label": "postgraduate",
725
+ "source_doc": "Complete Handbook",
726
+ "pages": [
727
+ 173,
728
+ 175
729
+ ],
730
+ "page_stats": [
731
+ {
732
+ "page": 173,
733
+ "source": "ocr",
734
+ "chars": 1761,
735
+ "seconds": 1.73
736
+ },
737
+ {
738
+ "page": 174,
739
+ "source": "ocr",
740
+ "chars": 1759,
741
+ "seconds": 1.37
742
+ },
743
+ {
744
+ "page": 175,
745
+ "source": "ocr",
746
+ "chars": 2295,
747
+ "seconds": 1.58
748
+ }
749
+ ],
750
+ "total_chars": 5858,
751
+ "seconds": 4.68
752
+ },
753
+ {
754
+ "section": "Research Guidance",
755
+ "subsection": "Plagiarism",
756
+ "scope_label": "postgraduate",
757
+ "source_doc": "Complete Handbook",
758
+ "pages": [
759
+ 176,
760
+ 176
761
+ ],
762
+ "page_stats": [
763
+ {
764
+ "page": 176,
765
+ "source": "ocr",
766
+ "chars": 620,
767
+ "seconds": 0.87
768
+ }
769
+ ],
770
+ "total_chars": 633,
771
+ "seconds": 0.87
772
+ },
773
+ {
774
+ "section": "Research Guidance",
775
+ "subsection": "Intellectual Property",
776
+ "scope_label": "postgraduate",
777
+ "source_doc": "Complete Handbook",
778
+ "pages": [
779
+ 177,
780
+ 177
781
+ ],
782
+ "page_stats": [
783
+ {
784
+ "page": 177,
785
+ "source": "ocr",
786
+ "chars": 587,
787
+ "seconds": 0.71
788
+ }
789
+ ],
790
+ "total_chars": 600,
791
+ "seconds": 0.71
792
+ },
793
+ {
794
+ "section": "Research Guidance",
795
+ "subsection": "Postgraduate Activities",
796
+ "scope_label": "postgraduate",
797
+ "source_doc": "Complete Handbook",
798
+ "pages": [
799
+ 178,
800
+ 181
801
+ ],
802
+ "page_stats": [
803
+ {
804
+ "page": 178,
805
+ "source": "native_filtered",
806
+ "chars": 0,
807
+ "seconds": 0.85
808
+ },
809
+ {
810
+ "page": 179,
811
+ "source": "ocr_filtered",
812
+ "chars": 0,
813
+ "seconds": 0.53
814
+ },
815
+ {
816
+ "page": 180,
817
+ "source": "ocr",
818
+ "chars": 95,
819
+ "seconds": 0.57
820
+ },
821
+ {
822
+ "page": 181,
823
+ "source": "ocr_filtered",
824
+ "chars": 0,
825
+ "seconds": 0.61
826
+ }
827
+ ],
828
+ "total_chars": 108,
829
+ "seconds": 2.56
830
+ },
831
+ {
832
+ "section": "Laboratory Regulations and Support",
833
+ "subsection": "Laboratory Regulations",
834
+ "scope_label": "general",
835
+ "source_doc": "Complete Handbook",
836
+ "pages": [
837
+ 183,
838
+ 183
839
+ ],
840
+ "page_stats": [
841
+ {
842
+ "page": 183,
843
+ "source": "ocr",
844
+ "chars": 1950,
845
+ "seconds": 1.42
846
+ }
847
+ ],
848
+ "total_chars": 1963,
849
+ "seconds": 1.42
850
+ },
851
+ {
852
+ "section": "Laboratory Regulations and Support",
853
+ "subsection": "Technical Problem Enquiries",
854
+ "scope_label": "general",
855
+ "source_doc": "Complete Handbook",
856
+ "pages": [
857
+ 184,
858
+ 184
859
+ ],
860
+ "page_stats": [
861
+ {
862
+ "page": 184,
863
+ "source": "ocr",
864
+ "chars": 791,
865
+ "seconds": 0.83
866
+ }
867
+ ],
868
+ "total_chars": 804,
869
+ "seconds": 0.83
870
+ },
871
+ {
872
+ "section": "Undergraduate Faculty Identity",
873
+ "subsection": "Vision and Mission",
874
+ "scope_label": "undergraduate",
875
+ "source_doc": "Complete Handbook",
876
+ "pages": [
877
+ 187,
878
+ 187
879
+ ],
880
+ "page_stats": [
881
+ {
882
+ "page": 187,
883
+ "source": "native",
884
+ "chars": 590,
885
+ "seconds": 0.01
886
+ }
887
+ ],
888
+ "total_chars": 603,
889
+ "seconds": 0.01
890
+ },
891
+ {
892
+ "section": "Faculty Staff",
893
+ "subsection": "Undergraduate Dean's Office and Department Leadership",
894
+ "scope_label": "general",
895
+ "source_doc": "Complete Handbook",
896
+ "pages": [
897
+ 192,
898
+ 199
899
+ ],
900
+ "page_stats": [
901
+ {
902
+ "page": 192,
903
+ "source": "native",
904
+ "chars": 1223,
905
+ "seconds": 0.01
906
+ },
907
+ {
908
+ "page": 193,
909
+ "source": "native",
910
+ "chars": 1100,
911
+ "seconds": 0.01
912
+ },
913
+ {
914
+ "page": 194,
915
+ "source": "native",
916
+ "chars": 403,
917
+ "seconds": 0.0
918
+ },
919
+ {
920
+ "page": 195,
921
+ "source": "native",
922
+ "chars": 1118,
923
+ "seconds": 0.0
924
+ },
925
+ {
926
+ "page": 196,
927
+ "source": "native",
928
+ "chars": 1263,
929
+ "seconds": 0.01
930
+ },
931
+ {
932
+ "page": 197,
933
+ "source": "native",
934
+ "chars": 1493,
935
+ "seconds": 0.01
936
+ },
937
+ {
938
+ "page": 198,
939
+ "source": "native",
940
+ "chars": 799,
941
+ "seconds": 0.0
942
+ },
943
+ {
944
+ "page": 199,
945
+ "source": "native",
946
+ "chars": 953,
947
+ "seconds": 0.01
948
+ }
949
+ ],
950
+ "total_chars": 8470,
951
+ "seconds": 0.04
952
+ },
953
+ {
954
+ "section": "Undergraduate Programmes",
955
+ "subsection": "Programmes Offered",
956
+ "scope_label": "undergraduate",
957
+ "source_doc": "Complete Handbook",
958
+ "pages": [
959
+ 200,
960
+ 200
961
+ ],
962
+ "page_stats": [
963
+ {
964
+ "page": 200,
965
+ "source": "native",
966
+ "chars": 419,
967
+ "seconds": 0.0
968
+ }
969
+ ],
970
+ "total_chars": 432,
971
+ "seconds": 0.0
972
+ },
973
+ {
974
+ "section": "Shared Undergraduate Curriculum",
975
+ "subsection": "University Courses",
976
+ "scope_label": "undergraduate",
977
+ "source_doc": "Complete Handbook",
978
+ "pages": [
979
+ 225,
980
+ 227
981
+ ],
982
+ "page_stats": [
983
+ {
984
+ "page": 225,
985
+ "source": "ocr_filtered",
986
+ "chars": 0,
987
+ "seconds": 0.3
988
+ },
989
+ {
990
+ "page": 226,
991
+ "source": "ocr",
992
+ "chars": 3602,
993
+ "seconds": 2.23
994
+ },
995
+ {
996
+ "page": 227,
997
+ "source": "ocr",
998
+ "chars": 383,
999
+ "seconds": 0.47
1000
+ }
1001
+ ],
1002
+ "total_chars": 4013,
1003
+ "seconds": 3.0
1004
+ },
1005
+ {
1006
+ "section": "Shared Undergraduate Curriculum",
1007
+ "subsection": "Faculty Core Courses",
1008
+ "scope_label": "undergraduate",
1009
+ "source_doc": "Complete Handbook",
1010
+ "pages": [
1011
+ 228,
1012
+ 230
1013
+ ],
1014
+ "page_stats": [
1015
+ {
1016
+ "page": 228,
1017
+ "source": "ocr_filtered",
1018
+ "chars": 0,
1019
+ "seconds": 0.33
1020
+ },
1021
+ {
1022
+ "page": 229,
1023
+ "source": "ocr",
1024
+ "chars": 3495,
1025
+ "seconds": 2.26
1026
+ },
1027
+ {
1028
+ "page": 230,
1029
+ "source": "ocr",
1030
+ "chars": 940,
1031
+ "seconds": 0.74
1032
+ }
1033
+ ],
1034
+ "total_chars": 4463,
1035
+ "seconds": 3.34
1036
+ },
1037
+ {
1038
+ "section": "Shared Undergraduate Curriculum",
1039
+ "subsection": "Programme Core Courses",
1040
+ "scope_label": "undergraduate",
1041
+ "source_doc": "Complete Handbook",
1042
+ "pages": [
1043
+ 231,
1044
+ 239
1045
+ ],
1046
+ "page_stats": [
1047
+ {
1048
+ "page": 231,
1049
+ "source": "native",
1050
+ "chars": 354,
1051
+ "seconds": 0.0
1052
+ },
1053
+ {
1054
+ "page": 232,
1055
+ "source": "ocr",
1056
+ "chars": 3401,
1057
+ "seconds": 2.1
1058
+ },
1059
+ {
1060
+ "page": 233,
1061
+ "source": "ocr",
1062
+ "chars": 3656,
1063
+ "seconds": 3.14
1064
+ },
1065
+ {
1066
+ "page": 234,
1067
+ "source": "ocr",
1068
+ "chars": 3204,
1069
+ "seconds": 2.7
1070
+ },
1071
+ {
1072
+ "page": 235,
1073
+ "source": "ocr",
1074
+ "chars": 3661,
1075
+ "seconds": 2.85
1076
+ },
1077
+ {
1078
+ "page": 236,
1079
+ "source": "ocr",
1080
+ "chars": 3604,
1081
+ "seconds": 2.67
1082
+ },
1083
+ {
1084
+ "page": 237,
1085
+ "source": "ocr",
1086
+ "chars": 3105,
1087
+ "seconds": 2.63
1088
+ },
1089
+ {
1090
+ "page": 238,
1091
+ "source": "ocr",
1092
+ "chars": 3197,
1093
+ "seconds": 2.38
1094
+ },
1095
+ {
1096
+ "page": 239,
1097
+ "source": "ocr",
1098
+ "chars": 2244,
1099
+ "seconds": 2.0
1100
+ }
1101
+ ],
1102
+ "total_chars": 26559,
1103
+ "seconds": 20.48
1104
+ },
1105
+ {
1106
+ "section": "Shared Undergraduate Curriculum",
1107
+ "subsection": "Specialization Elective Courses - Computer System and Network",
1108
+ "scope_label": "undergraduate",
1109
+ "source_doc": "Complete Handbook",
1110
+ "pages": [
1111
+ 240,
1112
+ 244
1113
+ ],
1114
+ "page_stats": [
1115
+ {
1116
+ "page": 240,
1117
+ "source": "ocr",
1118
+ "chars": 87,
1119
+ "seconds": 0.56
1120
+ },
1121
+ {
1122
+ "page": 241,
1123
+ "source": "ocr",
1124
+ "chars": 3637,
1125
+ "seconds": 2.72
1126
+ },
1127
+ {
1128
+ "page": 242,
1129
+ "source": "ocr",
1130
+ "chars": 3475,
1131
+ "seconds": 2.42
1132
+ },
1133
+ {
1134
+ "page": 243,
1135
+ "source": "ocr",
1136
+ "chars": 2926,
1137
+ "seconds": 2.62
1138
+ },
1139
+ {
1140
+ "page": 244,
1141
+ "source": "ocr",
1142
+ "chars": 2290,
1143
+ "seconds": 1.72
1144
+ }
1145
+ ],
1146
+ "total_chars": 12488,
1147
+ "seconds": 10.04
1148
+ },
1149
+ {
1150
+ "section": "Shared Undergraduate Curriculum",
1151
+ "subsection": "Specialization Elective Courses - Artificial Intelligence",
1152
+ "scope_label": "undergraduate",
1153
+ "source_doc": "Complete Handbook",
1154
+ "pages": [
1155
+ 245,
1156
+ 249
1157
+ ],
1158
+ "page_stats": [
1159
+ {
1160
+ "page": 245,
1161
+ "source": "ocr",
1162
+ "chars": 86,
1163
+ "seconds": 0.53
1164
+ },
1165
+ {
1166
+ "page": 246,
1167
+ "source": "ocr",
1168
+ "chars": 3239,
1169
+ "seconds": 2.28
1170
+ },
1171
+ {
1172
+ "page": 247,
1173
+ "source": "ocr",
1174
+ "chars": 3267,
1175
+ "seconds": 2.35
1176
+ },
1177
+ {
1178
+ "page": 248,
1179
+ "source": "ocr",
1180
+ "chars": 3545,
1181
+ "seconds": 2.23
1182
+ },
1183
+ {
1184
+ "page": 249,
1185
+ "source": "ocr",
1186
+ "chars": 2485,
1187
+ "seconds": 2.34
1188
+ }
1189
+ ],
1190
+ "total_chars": 12695,
1191
+ "seconds": 9.73
1192
+ },
1193
+ {
1194
+ "section": "Shared Undergraduate Curriculum",
1195
+ "subsection": "Specialization Elective Courses - Information Systems",
1196
+ "scope_label": "undergraduate",
1197
+ "source_doc": "Complete Handbook",
1198
+ "pages": [
1199
+ 250,
1200
+ 254
1201
+ ],
1202
+ "page_stats": [
1203
+ {
1204
+ "page": 250,
1205
+ "source": "ocr",
1206
+ "chars": 78,
1207
+ "seconds": 0.62
1208
+ },
1209
+ {
1210
+ "page": 251,
1211
+ "source": "ocr",
1212
+ "chars": 3833,
1213
+ "seconds": 2.52
1214
+ },
1215
+ {
1216
+ "page": 252,
1217
+ "source": "ocr",
1218
+ "chars": 3880,
1219
+ "seconds": 2.87
1220
+ },
1221
+ {
1222
+ "page": 253,
1223
+ "source": "ocr",
1224
+ "chars": 3654,
1225
+ "seconds": 2.41
1226
+ },
1227
+ {
1228
+ "page": 254,
1229
+ "source": "ocr",
1230
+ "chars": 1627,
1231
+ "seconds": 1.21
1232
+ }
1233
+ ],
1234
+ "total_chars": 13145,
1235
+ "seconds": 9.63
1236
+ },
1237
+ {
1238
+ "section": "Shared Undergraduate Curriculum",
1239
+ "subsection": "Specialization Elective Courses - Software Engineering",
1240
+ "scope_label": "undergraduate",
1241
+ "source_doc": "Complete Handbook",
1242
+ "pages": [
1243
+ 255,
1244
+ 259
1245
+ ],
1246
+ "page_stats": [
1247
+ {
1248
+ "page": 255,
1249
+ "source": "ocr",
1250
+ "chars": 76,
1251
+ "seconds": 0.44
1252
+ },
1253
+ {
1254
+ "page": 256,
1255
+ "source": "ocr",
1256
+ "chars": 3329,
1257
+ "seconds": 3.96
1258
+ },
1259
+ {
1260
+ "page": 257,
1261
+ "source": "ocr",
1262
+ "chars": 3533,
1263
+ "seconds": 3.88
1264
+ },
1265
+ {
1266
+ "page": 258,
1267
+ "source": "ocr",
1268
+ "chars": 3705,
1269
+ "seconds": 3.73
1270
+ },
1271
+ {
1272
+ "page": 259,
1273
+ "source": "ocr",
1274
+ "chars": 3259,
1275
+ "seconds": 2.37
1276
+ }
1277
+ ],
1278
+ "total_chars": 13975,
1279
+ "seconds": 14.38
1280
+ },
1281
+ {
1282
+ "section": "Shared Undergraduate Curriculum",
1283
+ "subsection": "Specialization Elective Courses - Multimedia Computing",
1284
+ "scope_label": "undergraduate",
1285
+ "source_doc": "Complete Handbook",
1286
+ "pages": [
1287
+ 260,
1288
+ 264
1289
+ ],
1290
+ "page_stats": [
1291
+ {
1292
+ "page": 260,
1293
+ "source": "ocr",
1294
+ "chars": 84,
1295
+ "seconds": 0.5
1296
+ },
1297
+ {
1298
+ "page": 261,
1299
+ "source": "ocr",
1300
+ "chars": 3062,
1301
+ "seconds": 1.9
1302
+ },
1303
+ {
1304
+ "page": 262,
1305
+ "source": "ocr",
1306
+ "chars": 3265,
1307
+ "seconds": 2.59
1308
+ },
1309
+ {
1310
+ "page": 263,
1311
+ "source": "ocr",
1312
+ "chars": 3556,
1313
+ "seconds": 2.82
1314
+ },
1315
+ {
1316
+ "page": 264,
1317
+ "source": "ocr",
1318
+ "chars": 2773,
1319
+ "seconds": 2.47
1320
+ }
1321
+ ],
1322
+ "total_chars": 12813,
1323
+ "seconds": 10.28
1324
+ },
1325
+ {
1326
+ "section": "Shared Undergraduate Curriculum",
1327
+ "subsection": "Specialization Elective Courses - Data Science",
1328
+ "scope_label": "undergraduate",
1329
+ "source_doc": "Complete Handbook",
1330
+ "pages": [
1331
+ 265,
1332
+ 268
1333
+ ],
1334
+ "page_stats": [
1335
+ {
1336
+ "page": 265,
1337
+ "source": "ocr",
1338
+ "chars": 70,
1339
+ "seconds": 0.52
1340
+ },
1341
+ {
1342
+ "page": 266,
1343
+ "source": "ocr",
1344
+ "chars": 3197,
1345
+ "seconds": 2.39
1346
+ },
1347
+ {
1348
+ "page": 267,
1349
+ "source": "ocr",
1350
+ "chars": 3047,
1351
+ "seconds": 3.97
1352
+ },
1353
+ {
1354
+ "page": 268,
1355
+ "source": "ocr",
1356
+ "chars": 1485,
1357
+ "seconds": 2.52
1358
+ }
1359
+ ],
1360
+ "total_chars": 7857,
1361
+ "seconds": 9.4
1362
+ },
1363
+ {
1364
+ "section": "Industrial Training",
1365
+ "subsection": "Industrial Training Guidelines",
1366
+ "scope_label": "undergraduate",
1367
+ "source_doc": "Complete Handbook",
1368
+ "pages": [
1369
+ 270,
1370
+ 280
1371
+ ],
1372
+ "page_stats": [
1373
+ {
1374
+ "page": 270,
1375
+ "source": "native",
1376
+ "chars": 2514,
1377
+ "seconds": 0.01
1378
+ },
1379
+ {
1380
+ "page": 271,
1381
+ "source": "native",
1382
+ "chars": 2402,
1383
+ "seconds": 0.01
1384
+ },
1385
+ {
1386
+ "page": 272,
1387
+ "source": "native",
1388
+ "chars": 2948,
1389
+ "seconds": 0.01
1390
+ },
1391
+ {
1392
+ "page": 273,
1393
+ "source": "native",
1394
+ "chars": 3534,
1395
+ "seconds": 0.01
1396
+ },
1397
+ {
1398
+ "page": 274,
1399
+ "source": "native",
1400
+ "chars": 703,
1401
+ "seconds": 0.0
1402
+ },
1403
+ {
1404
+ "page": 275,
1405
+ "source": "ocr",
1406
+ "chars": 776,
1407
+ "seconds": 0.97
1408
+ },
1409
+ {
1410
+ "page": 276,
1411
+ "source": "native",
1412
+ "chars": 178,
1413
+ "seconds": 0.01
1414
+ },
1415
+ {
1416
+ "page": 277,
1417
+ "source": "native",
1418
+ "chars": 2139,
1419
+ "seconds": 0.01
1420
+ },
1421
+ {
1422
+ "page": 278,
1423
+ "source": "native",
1424
+ "chars": 2934,
1425
+ "seconds": 0.01
1426
+ },
1427
+ {
1428
+ "page": 279,
1429
+ "source": "native",
1430
+ "chars": 1523,
1431
+ "seconds": 0.01
1432
+ },
1433
+ {
1434
+ "page": 280,
1435
+ "source": "native",
1436
+ "chars": 1754,
1437
+ "seconds": 0.01
1438
+ }
1439
+ ],
1440
+ "total_chars": 21568,
1441
+ "seconds": 1.07
1442
+ },
1443
+ {
1444
+ "section": "Academic Project",
1445
+ "subsection": "Academic Project I and II Guidelines",
1446
+ "scope_label": "undergraduate",
1447
+ "source_doc": "Complete Handbook",
1448
+ "pages": [
1449
+ 282,
1450
+ 289
1451
+ ],
1452
+ "page_stats": [
1453
+ {
1454
+ "page": 282,
1455
+ "source": "native",
1456
+ "chars": 1706,
1457
+ "seconds": 0.01
1458
+ },
1459
+ {
1460
+ "page": 283,
1461
+ "source": "native",
1462
+ "chars": 1708,
1463
+ "seconds": 0.01
1464
+ },
1465
+ {
1466
+ "page": 284,
1467
+ "source": "native",
1468
+ "chars": 1461,
1469
+ "seconds": 0.01
1470
+ },
1471
+ {
1472
+ "page": 285,
1473
+ "source": "native",
1474
+ "chars": 548,
1475
+ "seconds": 0.01
1476
+ },
1477
+ {
1478
+ "page": 286,
1479
+ "source": "native",
1480
+ "chars": 1732,
1481
+ "seconds": 0.01
1482
+ },
1483
+ {
1484
+ "page": 287,
1485
+ "source": "native",
1486
+ "chars": 1579,
1487
+ "seconds": 0.01
1488
+ },
1489
+ {
1490
+ "page": 288,
1491
+ "source": "native",
1492
+ "chars": 1302,
1493
+ "seconds": 0.01
1494
+ },
1495
+ {
1496
+ "page": 289,
1497
+ "source": "native",
1498
+ "chars": 672,
1499
+ "seconds": 0.01
1500
+ }
1501
+ ],
1502
+ "total_chars": 10826,
1503
+ "seconds": 0.09
1504
+ },
1505
+ {
1506
+ "section": "Language Path and English Communication",
1507
+ "subsection": "Language Path Course / English Communication Programme 2025/2026",
1508
+ "scope_label": "undergraduate",
1509
+ "source_doc": "Complete Handbook",
1510
+ "pages": [
1511
+ 292,
1512
+ 296
1513
+ ],
1514
+ "page_stats": [
1515
+ {
1516
+ "page": 292,
1517
+ "source": "native_filtered",
1518
+ "chars": 0,
1519
+ "seconds": 0.77
1520
+ },
1521
+ {
1522
+ "page": 293,
1523
+ "source": "ocr",
1524
+ "chars": 2721,
1525
+ "seconds": 3.34
1526
+ },
1527
+ {
1528
+ "page": 294,
1529
+ "source": "ocr",
1530
+ "chars": 1698,
1531
+ "seconds": 2.07
1532
+ },
1533
+ {
1534
+ "page": 295,
1535
+ "source": "ocr",
1536
+ "chars": 2450,
1537
+ "seconds": 2.3
1538
+ },
1539
+ {
1540
+ "page": 296,
1541
+ "source": "ocr",
1542
+ "chars": 2045,
1543
+ "seconds": 1.94
1544
+ }
1545
+ ],
1546
+ "total_chars": 8972,
1547
+ "seconds": 10.42
1548
+ },
1549
+ {
1550
+ "section": "Student Dress Code",
1551
+ "subsection": "Dress Code and Appearance Guides for Universiti Malaya Students",
1552
+ "scope_label": "general",
1553
+ "source_doc": "Complete Handbook",
1554
+ "pages": [
1555
+ 297,
1556
+ 298
1557
+ ],
1558
+ "page_stats": [
1559
+ {
1560
+ "page": "297-298",
1561
+ "source": "manual_visual_override",
1562
+ "chars": 1202,
1563
+ "seconds": 0.0
1564
+ }
1565
+ ],
1566
+ "total_chars": 1202,
1567
+ "seconds": 0.0
1568
+ },
1569
+ {
1570
+ "section": "Undergraduate Rules and Regulations",
1571
+ "subsection": "Examination Honesty and Discipline / Undergraduate Rules",
1572
+ "scope_label": "undergraduate",
1573
+ "source_doc": "Complete Handbook",
1574
+ "pages": [
1575
+ 299,
1576
+ 300
1577
+ ],
1578
+ "page_stats": [
1579
+ {
1580
+ "page": 299,
1581
+ "source": "native_filtered",
1582
+ "chars": 0,
1583
+ "seconds": 0.57
1584
+ },
1585
+ {
1586
+ "page": 300,
1587
+ "source": "native",
1588
+ "chars": 838,
1589
+ "seconds": 0.01
1590
+ }
1591
+ ],
1592
+ "total_chars": 851,
1593
+ "seconds": 0.58
1594
+ },
1595
+ {
1596
+ "section": "Examination Grading Scheme",
1597
+ "subsection": "Official University Grades",
1598
+ "scope_label": "undergraduate",
1599
+ "source_doc": "Complete Handbook",
1600
+ "pages": [
1601
+ 301,
1602
+ 301
1603
+ ],
1604
+ "page_stats": [
1605
+ {
1606
+ "page": 301,
1607
+ "source": "ocr",
1608
+ "chars": 481,
1609
+ "seconds": 0.76
1610
+ }
1611
+ ],
1612
+ "total_chars": 494,
1613
+ "seconds": 0.76
1614
+ },
1615
+ {
1616
+ "section": "Undergraduate Programme Goals and Learning Outcomes",
1617
+ "subsection": "Bachelor of Computer Science (Computer System and Network)",
1618
+ "scope_label": "undergraduate",
1619
+ "source_doc": "Complete Handbook",
1620
+ "pages": [
1621
+ 202,
1622
+ 204
1623
+ ],
1624
+ "page_stats": [
1625
+ {
1626
+ "page": 202,
1627
+ "source": "native",
1628
+ "chars": 1916,
1629
+ "seconds": 0.01
1630
+ },
1631
+ {
1632
+ "page": 203,
1633
+ "source": "ocr",
1634
+ "chars": 974,
1635
+ "seconds": 1.18
1636
+ },
1637
+ {
1638
+ "page": 204,
1639
+ "source": "ocr",
1640
+ "chars": 2214,
1641
+ "seconds": 2.39
1642
+ }
1643
+ ],
1644
+ "total_chars": 5147,
1645
+ "seconds": 3.58
1646
+ },
1647
+ {
1648
+ "section": "Undergraduate Programme Goals and Learning Outcomes",
1649
+ "subsection": "Bachelor of Computer Science (Artificial Intelligence)",
1650
+ "scope_label": "undergraduate",
1651
+ "source_doc": "Complete Handbook",
1652
+ "pages": [
1653
+ 206,
1654
+ 208
1655
+ ],
1656
+ "page_stats": [
1657
+ {
1658
+ "page": 206,
1659
+ "source": "native",
1660
+ "chars": 2037,
1661
+ "seconds": 0.01
1662
+ },
1663
+ {
1664
+ "page": 207,
1665
+ "source": "ocr",
1666
+ "chars": 2403,
1667
+ "seconds": 1.72
1668
+ },
1669
+ {
1670
+ "page": 208,
1671
+ "source": "ocr",
1672
+ "chars": 2362,
1673
+ "seconds": 2.05
1674
+ }
1675
+ ],
1676
+ "total_chars": 6845,
1677
+ "seconds": 3.78
1678
+ },
1679
+ {
1680
+ "section": "Undergraduate Programme Goals and Learning Outcomes",
1681
+ "subsection": "Bachelor of Computer Science (Information Systems)",
1682
+ "scope_label": "undergraduate",
1683
+ "source_doc": "Complete Handbook",
1684
+ "pages": [
1685
+ 210,
1686
+ 212
1687
+ ],
1688
+ "page_stats": [
1689
+ {
1690
+ "page": 210,
1691
+ "source": "native",
1692
+ "chars": 2018,
1693
+ "seconds": 0.0
1694
+ },
1695
+ {
1696
+ "page": 211,
1697
+ "source": "ocr",
1698
+ "chars": 2743,
1699
+ "seconds": 1.97
1700
+ },
1701
+ {
1702
+ "page": 212,
1703
+ "source": "ocr",
1704
+ "chars": 2339,
1705
+ "seconds": 1.62
1706
+ }
1707
+ ],
1708
+ "total_chars": 7143,
1709
+ "seconds": 3.59
1710
+ },
1711
+ {
1712
+ "section": "Undergraduate Programme Goals and Learning Outcomes",
1713
+ "subsection": "Bachelor of Computer Science (Software Engineering)",
1714
+ "scope_label": "undergraduate",
1715
+ "source_doc": "Complete Handbook",
1716
+ "pages": [
1717
+ 214,
1718
+ 216
1719
+ ],
1720
+ "page_stats": [
1721
+ {
1722
+ "page": 214,
1723
+ "source": "native",
1724
+ "chars": 1948,
1725
+ "seconds": 0.01
1726
+ },
1727
+ {
1728
+ "page": 215,
1729
+ "source": "ocr",
1730
+ "chars": 2867,
1731
+ "seconds": 1.93
1732
+ },
1733
+ {
1734
+ "page": 216,
1735
+ "source": "ocr",
1736
+ "chars": 2170,
1737
+ "seconds": 1.92
1738
+ }
1739
+ ],
1740
+ "total_chars": 7028,
1741
+ "seconds": 3.86
1742
+ },
1743
+ {
1744
+ "section": "Undergraduate Programme Goals and Learning Outcomes",
1745
+ "subsection": "Bachelor of Computer Science (Multimedia Computing)",
1746
+ "scope_label": "undergraduate",
1747
+ "source_doc": "Complete Handbook",
1748
+ "pages": [
1749
+ 218,
1750
+ 220
1751
+ ],
1752
+ "page_stats": [
1753
+ {
1754
+ "page": 218,
1755
+ "source": "native",
1756
+ "chars": 2009,
1757
+ "seconds": 0.01
1758
+ },
1759
+ {
1760
+ "page": 219,
1761
+ "source": "ocr",
1762
+ "chars": 2673,
1763
+ "seconds": 2.12
1764
+ },
1765
+ {
1766
+ "page": 220,
1767
+ "source": "ocr",
1768
+ "chars": 2398,
1769
+ "seconds": 2.0
1770
+ }
1771
+ ],
1772
+ "total_chars": 7123,
1773
+ "seconds": 4.13
1774
+ },
1775
+ {
1776
+ "section": "Undergraduate Programme Goals and Learning Outcomes",
1777
+ "subsection": "Bachelor of Computer Science (Data Science)",
1778
+ "scope_label": "undergraduate",
1779
+ "source_doc": "Complete Handbook",
1780
+ "pages": [
1781
+ 222,
1782
+ 224
1783
+ ],
1784
+ "page_stats": [
1785
+ {
1786
+ "page": 222,
1787
+ "source": "native",
1788
+ "chars": 1685,
1789
+ "seconds": 0.01
1790
+ },
1791
+ {
1792
+ "page": 223,
1793
+ "source": "ocr",
1794
+ "chars": 1842,
1795
+ "seconds": 1.61
1796
+ },
1797
+ {
1798
+ "page": 224,
1799
+ "source": "ocr",
1800
+ "chars": 1979,
1801
+ "seconds": 1.7
1802
+ }
1803
+ ],
1804
+ "total_chars": 5549,
1805
+ "seconds": 3.32
1806
+ },
1807
+ {
1808
+ "section": "Master of Computer Science (Applied Computing)",
1809
+ "subsection": "Programme Requirements",
1810
+ "scope_label": "postgraduate",
1811
+ "source_doc": "Complete Handbook",
1812
+ "pages": [
1813
+ 37,
1814
+ 38
1815
+ ],
1816
+ "page_stats": [
1817
+ {
1818
+ "page": 37,
1819
+ "source": "ocr",
1820
+ "chars": 2541,
1821
+ "seconds": 1.54
1822
+ },
1823
+ {
1824
+ "page": 38,
1825
+ "source": "ocr",
1826
+ "chars": 1535,
1827
+ "seconds": 1.09
1828
+ }
1829
+ ],
1830
+ "total_chars": 4102,
1831
+ "seconds": 2.64
1832
+ },
1833
+ {
1834
+ "section": "Master of Computer Science (Applied Computing)",
1835
+ "subsection": "Programme Objectives and Outcomes",
1836
+ "scope_label": "postgraduate",
1837
+ "source_doc": "Complete Handbook",
1838
+ "pages": [
1839
+ 39,
1840
+ 40
1841
+ ],
1842
+ "page_stats": [
1843
+ {
1844
+ "page": 39,
1845
+ "source": "ocr",
1846
+ "chars": 2021,
1847
+ "seconds": 1.55
1848
+ },
1849
+ {
1850
+ "page": 40,
1851
+ "source": "ocr",
1852
+ "chars": 119,
1853
+ "seconds": 0.42
1854
+ }
1855
+ ],
1856
+ "total_chars": 2166,
1857
+ "seconds": 1.97
1858
+ },
1859
+ {
1860
+ "section": "Master of Computer Science (Applied Computing)",
1861
+ "subsection": "Candidature Requirements",
1862
+ "scope_label": "postgraduate",
1863
+ "source_doc": "Complete Handbook",
1864
+ "pages": [
1865
+ 41,
1866
+ 41
1867
+ ],
1868
+ "page_stats": [
1869
+ {
1870
+ "page": 41,
1871
+ "source": "ocr",
1872
+ "chars": 449,
1873
+ "seconds": 0.59
1874
+ }
1875
+ ],
1876
+ "total_chars": 461,
1877
+ "seconds": 0.59
1878
+ },
1879
+ {
1880
+ "section": "Master of Computer Science (Applied Computing)",
1881
+ "subsection": "Graduate on Time (GOT) Schedule",
1882
+ "scope_label": "postgraduate",
1883
+ "source_doc": "Complete Handbook",
1884
+ "pages": [
1885
+ 42,
1886
+ 42
1887
+ ],
1888
+ "page_stats": [
1889
+ {
1890
+ "page": 42,
1891
+ "source": "ocr",
1892
+ "chars": 2485,
1893
+ "seconds": 1.45
1894
+ }
1895
+ ],
1896
+ "total_chars": 2497,
1897
+ "seconds": 1.45
1898
+ },
1899
+ {
1900
+ "section": "Master of Computer Science (Applied Computing)",
1901
+ "subsection": "Course Plan",
1902
+ "scope_label": "postgraduate",
1903
+ "source_doc": "Complete Handbook",
1904
+ "pages": [
1905
+ 43,
1906
+ 44
1907
+ ],
1908
+ "page_stats": [
1909
+ {
1910
+ "page": 43,
1911
+ "source": "ocr",
1912
+ "chars": 1060,
1913
+ "seconds": 0.96
1914
+ },
1915
+ {
1916
+ "page": 44,
1917
+ "source": "ocr",
1918
+ "chars": 1001,
1919
+ "seconds": 0.94
1920
+ }
1921
+ ],
1922
+ "total_chars": 2087,
1923
+ "seconds": 1.9
1924
+ },
1925
+ {
1926
+ "section": "Master of Computer Science (Applied Computing)",
1927
+ "subsection": "List of Courses and Contents",
1928
+ "scope_label": "postgraduate",
1929
+ "source_doc": "Complete Handbook",
1930
+ "pages": [
1931
+ 45,
1932
+ 50
1933
+ ],
1934
+ "page_stats": [
1935
+ {
1936
+ "page": 45,
1937
+ "source": "ocr",
1938
+ "chars": 934,
1939
+ "seconds": 0.71
1940
+ },
1941
+ {
1942
+ "page": 46,
1943
+ "source": "ocr",
1944
+ "chars": 2137,
1945
+ "seconds": 1.73
1946
+ },
1947
+ {
1948
+ "page": 47,
1949
+ "source": "ocr",
1950
+ "chars": 2038,
1951
+ "seconds": 1.52
1952
+ },
1953
+ {
1954
+ "page": 48,
1955
+ "source": "ocr",
1956
+ "chars": 2322,
1957
+ "seconds": 1.73
1958
+ },
1959
+ {
1960
+ "page": 49,
1961
+ "source": "ocr",
1962
+ "chars": 2567,
1963
+ "seconds": 1.7
1964
+ },
1965
+ {
1966
+ "page": 50,
1967
+ "source": "ocr",
1968
+ "chars": 758,
1969
+ "seconds": 0.67
1970
+ }
1971
+ ],
1972
+ "total_chars": 10838,
1973
+ "seconds": 8.06
1974
+ },
1975
+ {
1976
+ "section": "Master of Software Engineering (Software Technology)",
1977
+ "subsection": "Programme Requirements",
1978
+ "scope_label": "postgraduate",
1979
+ "source_doc": "Complete Handbook",
1980
+ "pages": [
1981
+ 52,
1982
+ 53
1983
+ ],
1984
+ "page_stats": [
1985
+ {
1986
+ "page": 52,
1987
+ "source": "ocr",
1988
+ "chars": 2365,
1989
+ "seconds": 1.45
1990
+ },
1991
+ {
1992
+ "page": 53,
1993
+ "source": "ocr",
1994
+ "chars": 1714,
1995
+ "seconds": 1.18
1996
+ }
1997
+ ],
1998
+ "total_chars": 4105,
1999
+ "seconds": 2.63
2000
+ },
2001
+ {
2002
+ "section": "Master of Software Engineering (Software Technology)",
2003
+ "subsection": "Programme Objectives and Outcomes",
2004
+ "scope_label": "postgraduate",
2005
+ "source_doc": "Complete Handbook",
2006
+ "pages": [
2007
+ 54,
2008
+ 55
2009
+ ],
2010
+ "page_stats": [
2011
+ {
2012
+ "page": 54,
2013
+ "source": "ocr",
2014
+ "chars": 1842,
2015
+ "seconds": 1.78
2016
+ },
2017
+ {
2018
+ "page": 55,
2019
+ "source": "ocr",
2020
+ "chars": 434,
2021
+ "seconds": 0.53
2022
+ }
2023
+ ],
2024
+ "total_chars": 2302,
2025
+ "seconds": 2.31
2026
+ },
2027
+ {
2028
+ "section": "Master of Software Engineering (Software Technology)",
2029
+ "subsection": "Candidature Requirements",
2030
+ "scope_label": "postgraduate",
2031
+ "source_doc": "Complete Handbook",
2032
+ "pages": [
2033
+ 56,
2034
+ 56
2035
+ ],
2036
+ "page_stats": [
2037
+ {
2038
+ "page": 56,
2039
+ "source": "ocr",
2040
+ "chars": 528,
2041
+ "seconds": 0.47
2042
+ }
2043
+ ],
2044
+ "total_chars": 540,
2045
+ "seconds": 0.47
2046
+ },
2047
+ {
2048
+ "section": "Master of Software Engineering (Software Technology)",
2049
+ "subsection": "Graduate on Time (GOT) Schedule",
2050
+ "scope_label": "postgraduate",
2051
+ "source_doc": "Complete Handbook",
2052
+ "pages": [
2053
+ 57,
2054
+ 57
2055
+ ],
2056
+ "page_stats": [
2057
+ {
2058
+ "page": 57,
2059
+ "source": "ocr",
2060
+ "chars": 1834,
2061
+ "seconds": 1.49
2062
+ }
2063
+ ],
2064
+ "total_chars": 1846,
2065
+ "seconds": 1.49
2066
+ },
2067
+ {
2068
+ "section": "Master of Software Engineering (Software Technology)",
2069
+ "subsection": "Course Plan",
2070
+ "scope_label": "postgraduate",
2071
+ "source_doc": "Complete Handbook",
2072
+ "pages": [
2073
+ 59,
2074
+ 61
2075
+ ],
2076
+ "page_stats": [
2077
+ {
2078
+ "page": 59,
2079
+ "source": "ocr",
2080
+ "chars": 1178,
2081
+ "seconds": 0.95
2082
+ },
2083
+ {
2084
+ "page": 60,
2085
+ "source": "ocr",
2086
+ "chars": 1207,
2087
+ "seconds": 1.19
2088
+ },
2089
+ {
2090
+ "page": 61,
2091
+ "source": "ocr",
2092
+ "chars": 2744,
2093
+ "seconds": 1.8
2094
+ }
2095
+ ],
2096
+ "total_chars": 5169,
2097
+ "seconds": 3.94
2098
+ },
2099
+ {
2100
+ "section": "Master of Software Engineering (Software Technology)",
2101
+ "subsection": "List of Courses and Contents",
2102
+ "scope_label": "postgraduate",
2103
+ "source_doc": "Complete Handbook",
2104
+ "pages": [
2105
+ 62,
2106
+ 68
2107
+ ],
2108
+ "page_stats": [
2109
+ {
2110
+ "page": 62,
2111
+ "source": "ocr",
2112
+ "chars": 983,
2113
+ "seconds": 0.8
2114
+ },
2115
+ {
2116
+ "page": 63,
2117
+ "source": "ocr",
2118
+ "chars": 2055,
2119
+ "seconds": 1.61
2120
+ },
2121
+ {
2122
+ "page": 64,
2123
+ "source": "ocr",
2124
+ "chars": 1893,
2125
+ "seconds": 1.35
2126
+ },
2127
+ {
2128
+ "page": 65,
2129
+ "source": "ocr",
2130
+ "chars": 2481,
2131
+ "seconds": 2.0
2132
+ },
2133
+ {
2134
+ "page": 66,
2135
+ "source": "ocr",
2136
+ "chars": 1943,
2137
+ "seconds": 1.63
2138
+ },
2139
+ {
2140
+ "page": 67,
2141
+ "source": "ocr",
2142
+ "chars": 2598,
2143
+ "seconds": 1.9
2144
+ },
2145
+ {
2146
+ "page": 68,
2147
+ "source": "ocr",
2148
+ "chars": 1635,
2149
+ "seconds": 1.33
2150
+ }
2151
+ ],
2152
+ "total_chars": 13684,
2153
+ "seconds": 10.63
2154
+ },
2155
+ {
2156
+ "section": "Master in Data Science",
2157
+ "subsection": "Programme Requirements",
2158
+ "scope_label": "postgraduate",
2159
+ "source_doc": "Complete Handbook",
2160
+ "pages": [
2161
+ 70,
2162
+ 71
2163
+ ],
2164
+ "page_stats": [
2165
+ {
2166
+ "page": 70,
2167
+ "source": "ocr",
2168
+ "chars": 1886,
2169
+ "seconds": 1.36
2170
+ },
2171
+ {
2172
+ "page": 71,
2173
+ "source": "ocr",
2174
+ "chars": 670,
2175
+ "seconds": 1.56
2176
+ }
2177
+ ],
2178
+ "total_chars": 2582,
2179
+ "seconds": 2.92
2180
+ },
2181
+ {
2182
+ "section": "Master in Data Science",
2183
+ "subsection": "Programme Objectives and Outcomes",
2184
+ "scope_label": "postgraduate",
2185
+ "source_doc": "Complete Handbook",
2186
+ "pages": [
2187
+ 72,
2188
+ 74
2189
+ ],
2190
+ "page_stats": [
2191
+ {
2192
+ "page": 72,
2193
+ "source": "ocr",
2194
+ "chars": 2078,
2195
+ "seconds": 1.49
2196
+ },
2197
+ {
2198
+ "page": 73,
2199
+ "source": "ocr",
2200
+ "chars": 117,
2201
+ "seconds": 0.35
2202
+ },
2203
+ {
2204
+ "page": 74,
2205
+ "source": "ocr",
2206
+ "chars": 455,
2207
+ "seconds": 0.54
2208
+ }
2209
+ ],
2210
+ "total_chars": 2690,
2211
+ "seconds": 2.38
2212
+ },
2213
+ {
2214
+ "section": "Master in Data Science",
2215
+ "subsection": "Course Plan",
2216
+ "scope_label": "postgraduate",
2217
+ "source_doc": "Complete Handbook",
2218
+ "pages": [
2219
+ 75,
2220
+ 76
2221
+ ],
2222
+ "page_stats": [
2223
+ {
2224
+ "page": 75,
2225
+ "source": "ocr",
2226
+ "chars": 1310,
2227
+ "seconds": 1.14
2228
+ },
2229
+ {
2230
+ "page": 76,
2231
+ "source": "ocr",
2232
+ "chars": 1222,
2233
+ "seconds": 0.97
2234
+ }
2235
+ ],
2236
+ "total_chars": 2558,
2237
+ "seconds": 2.12
2238
+ },
2239
+ {
2240
+ "section": "Master in Data Science",
2241
+ "subsection": "List of Courses and Contents",
2242
+ "scope_label": "postgraduate",
2243
+ "source_doc": "Complete Handbook",
2244
+ "pages": [
2245
+ 77,
2246
+ 82
2247
+ ],
2248
+ "page_stats": [
2249
+ {
2250
+ "page": 77,
2251
+ "source": "ocr",
2252
+ "chars": 985,
2253
+ "seconds": 0.8
2254
+ },
2255
+ {
2256
+ "page": 78,
2257
+ "source": "ocr",
2258
+ "chars": 2765,
2259
+ "seconds": 1.59
2260
+ },
2261
+ {
2262
+ "page": 79,
2263
+ "source": "ocr",
2264
+ "chars": 2612,
2265
+ "seconds": 1.51
2266
+ },
2267
+ {
2268
+ "page": 80,
2269
+ "source": "ocr",
2270
+ "chars": 2480,
2271
+ "seconds": 1.53
2272
+ },
2273
+ {
2274
+ "page": 81,
2275
+ "source": "ocr",
2276
+ "chars": 2256,
2277
+ "seconds": 1.43
2278
+ },
2279
+ {
2280
+ "page": 82,
2281
+ "source": "ocr",
2282
+ "chars": 931,
2283
+ "seconds": 0.71
2284
+ }
2285
+ ],
2286
+ "total_chars": 12111,
2287
+ "seconds": 7.58
2288
+ },
2289
+ {
2290
+ "section": "Master of Cyber Security",
2291
+ "subsection": "Programme Requirements",
2292
+ "scope_label": "postgraduate",
2293
+ "source_doc": "Complete Handbook",
2294
+ "pages": [
2295
+ 84,
2296
+ 86
2297
+ ],
2298
+ "page_stats": [
2299
+ {
2300
+ "page": 84,
2301
+ "source": "ocr",
2302
+ "chars": 1084,
2303
+ "seconds": 0.78
2304
+ },
2305
+ {
2306
+ "page": 85,
2307
+ "source": "ocr",
2308
+ "chars": 2367,
2309
+ "seconds": 1.28
2310
+ },
2311
+ {
2312
+ "page": 86,
2313
+ "source": "ocr",
2314
+ "chars": 476,
2315
+ "seconds": 0.52
2316
+ }
2317
+ ],
2318
+ "total_chars": 3967,
2319
+ "seconds": 2.58
2320
+ },
2321
+ {
2322
+ "section": "Master of Cyber Security",
2323
+ "subsection": "Programme Objectives and Outcomes",
2324
+ "scope_label": "postgraduate",
2325
+ "source_doc": "Complete Handbook",
2326
+ "pages": [
2327
+ 87,
2328
+ 88
2329
+ ],
2330
+ "page_stats": [
2331
+ {
2332
+ "page": 87,
2333
+ "source": "ocr",
2334
+ "chars": 1926,
2335
+ "seconds": 1.22
2336
+ },
2337
+ {
2338
+ "page": 88,
2339
+ "source": "ocr",
2340
+ "chars": 713,
2341
+ "seconds": 0.68
2342
+ }
2343
+ ],
2344
+ "total_chars": 2665,
2345
+ "seconds": 1.91
2346
+ },
2347
+ {
2348
+ "section": "Master of Cyber Security",
2349
+ "subsection": "Course Plan",
2350
+ "scope_label": "postgraduate",
2351
+ "source_doc": "Complete Handbook",
2352
+ "pages": [
2353
+ 89,
2354
+ 90
2355
+ ],
2356
+ "page_stats": [
2357
+ {
2358
+ "page": 89,
2359
+ "source": "ocr",
2360
+ "chars": 1247,
2361
+ "seconds": 1.04
2362
+ },
2363
+ {
2364
+ "page": 90,
2365
+ "source": "ocr",
2366
+ "chars": 1260,
2367
+ "seconds": 0.95
2368
+ }
2369
+ ],
2370
+ "total_chars": 2533,
2371
+ "seconds": 1.99
2372
+ },
2373
+ {
2374
+ "section": "Master of Cyber Security",
2375
+ "subsection": "List of Courses and Contents",
2376
+ "scope_label": "postgraduate",
2377
+ "source_doc": "Complete Handbook",
2378
+ "pages": [
2379
+ 91,
2380
+ 97
2381
+ ],
2382
+ "page_stats": [
2383
+ {
2384
+ "page": 91,
2385
+ "source": "ocr",
2386
+ "chars": 1052,
2387
+ "seconds": 0.79
2388
+ },
2389
+ {
2390
+ "page": 92,
2391
+ "source": "ocr",
2392
+ "chars": 2352,
2393
+ "seconds": 1.46
2394
+ },
2395
+ {
2396
+ "page": 93,
2397
+ "source": "ocr",
2398
+ "chars": 2367,
2399
+ "seconds": 1.34
2400
+ },
2401
+ {
2402
+ "page": 94,
2403
+ "source": "ocr",
2404
+ "chars": 2098,
2405
+ "seconds": 1.53
2406
+ },
2407
+ {
2408
+ "page": 95,
2409
+ "source": "ocr",
2410
+ "chars": 2644,
2411
+ "seconds": 2.9
2412
+ },
2413
+ {
2414
+ "page": 96,
2415
+ "source": "ocr",
2416
+ "chars": 2141,
2417
+ "seconds": 2.39
2418
+ },
2419
+ {
2420
+ "page": 97,
2421
+ "source": "ocr",
2422
+ "chars": 716,
2423
+ "seconds": 1.18
2424
+ }
2425
+ ],
2426
+ "total_chars": 13466,
2427
+ "seconds": 11.6
2428
+ },
2429
+ {
2430
+ "section": "Master of Artificial Intelligence",
2431
+ "subsection": "Programme Requirements",
2432
+ "scope_label": "postgraduate",
2433
+ "source_doc": "Complete Handbook",
2434
+ "pages": [
2435
+ 99,
2436
+ 100
2437
+ ],
2438
+ "page_stats": [
2439
+ {
2440
+ "page": 99,
2441
+ "source": "ocr",
2442
+ "chars": 1426,
2443
+ "seconds": 2.16
2444
+ },
2445
+ {
2446
+ "page": 100,
2447
+ "source": "ocr",
2448
+ "chars": 983,
2449
+ "seconds": 1.14
2450
+ }
2451
+ ],
2452
+ "total_chars": 2436,
2453
+ "seconds": 3.3
2454
+ },
2455
+ {
2456
+ "section": "Master of Artificial Intelligence",
2457
+ "subsection": "Programme Objectives and Outcomes",
2458
+ "scope_label": "postgraduate",
2459
+ "source_doc": "Complete Handbook",
2460
+ "pages": [
2461
+ 101,
2462
+ 102
2463
+ ],
2464
+ "page_stats": [
2465
+ {
2466
+ "page": 101,
2467
+ "source": "ocr",
2468
+ "chars": 1781,
2469
+ "seconds": 2.87
2470
+ },
2471
+ {
2472
+ "page": 102,
2473
+ "source": "ocr",
2474
+ "chars": 587,
2475
+ "seconds": 0.73
2476
+ }
2477
+ ],
2478
+ "total_chars": 2396,
2479
+ "seconds": 3.6
2480
+ },
2481
+ {
2482
+ "section": "Master of Artificial Intelligence",
2483
+ "subsection": "Course Plan",
2484
+ "scope_label": "postgraduate",
2485
+ "source_doc": "Complete Handbook",
2486
+ "pages": [
2487
+ 103,
2488
+ 103
2489
+ ],
2490
+ "page_stats": [
2491
+ {
2492
+ "page": 103,
2493
+ "source": "ocr",
2494
+ "chars": 1442,
2495
+ "seconds": 1.2
2496
+ }
2497
+ ],
2498
+ "total_chars": 1455,
2499
+ "seconds": 1.2
2500
+ },
2501
+ {
2502
+ "section": "Master of Artificial Intelligence",
2503
+ "subsection": "List of Courses and Contents",
2504
+ "scope_label": "postgraduate",
2505
+ "source_doc": "Complete Handbook",
2506
+ "pages": [
2507
+ 104,
2508
+ 111
2509
+ ],
2510
+ "page_stats": [
2511
+ {
2512
+ "page": 104,
2513
+ "source": "ocr",
2514
+ "chars": 1110,
2515
+ "seconds": 0.94
2516
+ },
2517
+ {
2518
+ "page": 105,
2519
+ "source": "ocr",
2520
+ "chars": 2359,
2521
+ "seconds": 1.53
2522
+ },
2523
+ {
2524
+ "page": 106,
2525
+ "source": "ocr",
2526
+ "chars": 2484,
2527
+ "seconds": 1.5
2528
+ },
2529
+ {
2530
+ "page": 107,
2531
+ "source": "ocr",
2532
+ "chars": 1911,
2533
+ "seconds": 1.15
2534
+ },
2535
+ {
2536
+ "page": 108,
2537
+ "source": "ocr",
2538
+ "chars": 2235,
2539
+ "seconds": 1.31
2540
+ },
2541
+ {
2542
+ "page": 109,
2543
+ "source": "ocr",
2544
+ "chars": 2413,
2545
+ "seconds": 1.46
2546
+ },
2547
+ {
2548
+ "page": 110,
2549
+ "source": "ocr",
2550
+ "chars": 1976,
2551
+ "seconds": 1.56
2552
+ },
2553
+ {
2554
+ "page": 111,
2555
+ "source": "ocr",
2556
+ "chars": 1211,
2557
+ "seconds": 0.95
2558
+ }
2559
+ ],
2560
+ "total_chars": 15817,
2561
+ "seconds": 10.39
2562
+ },
2563
+ {
2564
+ "section": "Master of Computer Science (By Research)",
2565
+ "subsection": "Programme Requirements",
2566
+ "scope_label": "postgraduate",
2567
+ "source_doc": "Complete Handbook",
2568
+ "pages": [
2569
+ 113,
2570
+ 113
2571
+ ],
2572
+ "page_stats": [
2573
+ {
2574
+ "page": 113,
2575
+ "source": "ocr",
2576
+ "chars": 1523,
2577
+ "seconds": 1.33
2578
+ }
2579
+ ],
2580
+ "total_chars": 1536,
2581
+ "seconds": 1.33
2582
+ },
2583
+ {
2584
+ "section": "Master of Computer Science (By Research)",
2585
+ "subsection": "Learning Objectives and Outcomes",
2586
+ "scope_label": "postgraduate",
2587
+ "source_doc": "Complete Handbook",
2588
+ "pages": [
2589
+ 114,
2590
+ 115
2591
+ ],
2592
+ "page_stats": [
2593
+ {
2594
+ "page": 114,
2595
+ "source": "ocr",
2596
+ "chars": 1233,
2597
+ "seconds": 1.1
2598
+ },
2599
+ {
2600
+ "page": 115,
2601
+ "source": "ocr",
2602
+ "chars": 500,
2603
+ "seconds": 0.59
2604
+ }
2605
+ ],
2606
+ "total_chars": 1761,
2607
+ "seconds": 1.69
2608
+ },
2609
+ {
2610
+ "section": "Master of Computer Science (By Research)",
2611
+ "subsection": "Candidature Requirements",
2612
+ "scope_label": "postgraduate",
2613
+ "source_doc": "Complete Handbook",
2614
+ "pages": [
2615
+ 116,
2616
+ 116
2617
+ ],
2618
+ "page_stats": [
2619
+ {
2620
+ "page": 116,
2621
+ "source": "ocr",
2622
+ "chars": 1206,
2623
+ "seconds": 0.91
2624
+ }
2625
+ ],
2626
+ "total_chars": 1219,
2627
+ "seconds": 0.91
2628
+ },
2629
+ {
2630
+ "section": "Master of Computer Science (By Research)",
2631
+ "subsection": "Graduate on Time (GOT) Schedule",
2632
+ "scope_label": "postgraduate",
2633
+ "source_doc": "Complete Handbook",
2634
+ "pages": [
2635
+ 117,
2636
+ 117
2637
+ ],
2638
+ "page_stats": [
2639
+ {
2640
+ "page": 117,
2641
+ "source": "ocr",
2642
+ "chars": 2103,
2643
+ "seconds": 1.9
2644
+ }
2645
+ ],
2646
+ "total_chars": 2116,
2647
+ "seconds": 1.9
2648
+ },
2649
+ {
2650
+ "section": "Master of Computer Science (By Research)",
2651
+ "subsection": "Research Methodology / Course Contents",
2652
+ "scope_label": "postgraduate",
2653
+ "source_doc": "Complete Handbook",
2654
+ "pages": [
2655
+ 118,
2656
+ 118
2657
+ ],
2658
+ "page_stats": [
2659
+ {
2660
+ "page": 118,
2661
+ "source": "ocr",
2662
+ "chars": 1765,
2663
+ "seconds": 1.34
2664
+ }
2665
+ ],
2666
+ "total_chars": 1778,
2667
+ "seconds": 1.34
2668
+ },
2669
+ {
2670
+ "section": "Doctor of Philosophy",
2671
+ "subsection": "Advanced Research Methods Course Content",
2672
+ "scope_label": "postgraduate",
2673
+ "source_doc": "Complete Handbook",
2674
+ "pages": [
2675
+ 120,
2676
+ 120
2677
+ ],
2678
+ "page_stats": [
2679
+ {
2680
+ "page": 120,
2681
+ "source": "ocr",
2682
+ "chars": 1210,
2683
+ "seconds": 1.09
2684
+ }
2685
+ ],
2686
+ "total_chars": 1223,
2687
+ "seconds": 1.09
2688
+ },
2689
+ {
2690
+ "section": "Doctor of Philosophy",
2691
+ "subsection": "Programme Education Objectives",
2692
+ "scope_label": "postgraduate",
2693
+ "source_doc": "Complete Handbook",
2694
+ "pages": [
2695
+ 121,
2696
+ 121
2697
+ ],
2698
+ "page_stats": [
2699
+ {
2700
+ "page": 121,
2701
+ "source": "ocr",
2702
+ "chars": 336,
2703
+ "seconds": 0.6
2704
+ }
2705
+ ],
2706
+ "total_chars": 349,
2707
+ "seconds": 0.6
2708
+ },
2709
+ {
2710
+ "section": "Doctor of Philosophy",
2711
+ "subsection": "Learning Outcomes",
2712
+ "scope_label": "postgraduate",
2713
+ "source_doc": "Complete Handbook",
2714
+ "pages": [
2715
+ 122,
2716
+ 122
2717
+ ],
2718
+ "page_stats": [
2719
+ {
2720
+ "page": 122,
2721
+ "source": "ocr",
2722
+ "chars": 588,
2723
+ "seconds": 0.65
2724
+ }
2725
+ ],
2726
+ "total_chars": 601,
2727
+ "seconds": 0.65
2728
+ },
2729
+ {
2730
+ "section": "Doctor of Philosophy",
2731
+ "subsection": "Candidature Requirements",
2732
+ "scope_label": "postgraduate",
2733
+ "source_doc": "Complete Handbook",
2734
+ "pages": [
2735
+ 123,
2736
+ 123
2737
+ ],
2738
+ "page_stats": [
2739
+ {
2740
+ "page": 123,
2741
+ "source": "ocr",
2742
+ "chars": 1535,
2743
+ "seconds": 2.15
2744
+ }
2745
+ ],
2746
+ "total_chars": 1548,
2747
+ "seconds": 2.15
2748
+ },
2749
+ {
2750
+ "section": "Doctor of Philosophy",
2751
+ "subsection": "Proposed Graduate on Time (GOT) Schedule",
2752
+ "scope_label": "postgraduate",
2753
+ "source_doc": "Complete Handbook",
2754
+ "pages": [
2755
+ 124,
2756
+ 124
2757
+ ],
2758
+ "page_stats": [
2759
+ {
2760
+ "page": 124,
2761
+ "source": "ocr",
2762
+ "chars": 2280,
2763
+ "seconds": 1.77
2764
+ }
2765
+ ],
2766
+ "total_chars": 2293,
2767
+ "seconds": 1.77
2768
+ }
2769
+ ],
2770
+ "total_seconds": 313.89
2771
+ }
UM_Handbook/FineTune_QWEN3_UM_Handbook_optimized_1.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
UM_Handbook/UM_Handbook_Markdown_Preprocess.py ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from pathlib import Path
3
+ import json
4
+ import re
5
+ import time
6
+ from typing import List
7
+
8
+ import fitz
9
+ import pytesseract
10
+ from PIL import Image
11
+
12
+ from um_handbook_config import (
13
+ GENERAL_PDF,
14
+ COMPLETE_PDF,
15
+ GENERAL_BLOCKS,
16
+ COMPLETE_BLOCKS,
17
+ DATA_ROOT,
18
+ MARKDOWN_DIR,
19
+ REPORTS_DIR,
20
+ )
21
+
22
# Directory containing this script.
PROJECT_DIR = Path(__file__).resolve().parent

# Make sure every output directory exists before anything is written.
# parents=True keeps this working when a directory is nested under a
# parent that has not been created yet (plain mkdir would raise
# FileNotFoundError in that case).
DATA_ROOT.mkdir(parents=True, exist_ok=True)
MARKDOWN_DIR.mkdir(parents=True, exist_ok=True)
REPORTS_DIR.mkdir(parents=True, exist_ok=True)

# Output artefacts produced by main().
GENERAL_MD = MARKDOWN_DIR / "general_handbook_structured.md"
COMPLETE_MD = MARKDOWN_DIR / "complete_handbook_structured.md"
REPORT_PATH = REPORTS_DIR / "um_handbook_markdown_report.json"
31
+
32
# Regex fragments that mark low-information pages (banners, covers, contents).
# They are kept as plain strings, not compiled patterns, because
# looks_like_noise_page() passes flags=re.IGNORECASE to re.search(), which is
# not allowed together with an already-compiled pattern.
# Note: the last pattern is anchored with ^...$ and is searched against the
# whole page collapsed onto one line, so it only fires when the page text is
# nothing but the (possibly letter-spaced) word "CONTENTS".
BAD_PAGE_PATTERNS = [
    r"\bmore info\b",
    r"fsktm[_\.]?um",
    r"POSTGRADUATE\s+PROGRAMME\s+HANDBOOK",
    r"UNDERGRADUATE\s+PROGRAMME\s+HANDBOOK",
    r"^C\s*O\s*N\s*T\s*E\s*N\s*T\s*S$",
]
39
+
40
+
41
def normalize_whitespace(text: str) -> str:
    """Normalise spacing and a few troublesome characters in extracted text.

    Steps, in order:
      * non-breaking spaces become regular spaces, soft hyphens are removed;
      * the typographic ligatures U+FB01 ("fi") and U+FB02 ("fl") are expanded
        to their two-letter ASCII forms;
      * runs of spaces/tabs collapse to a single space (newlines are kept);
      * three or more consecutive newlines collapse to exactly two;
      * leading/trailing whitespace is stripped.

    Args:
        text: Raw text.

    Returns:
        The normalised text (possibly empty).
    """
    text = text.replace("\u00a0", " ").replace("\xad", "")
    # Written as escapes on purpose: the literal ligature glyphs are easily
    # folded to plain "fi"/"fl" by editors and copy/paste, which silently
    # turns these replacements into no-ops.
    text = text.replace("\ufb01", "fi").replace("\ufb02", "fl")
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()
47
+
48
+
49
def clean_page_text(text: str) -> str:
    """Drop layout debris from one page of text and normalise list markers.

    Line filter (applied to each stripped line):
      * empty lines are dropped;
      * lines consisting only of digits are dropped;
      * single-character lines that are not alphanumeric are dropped.

    Afterwards, a leading bullet glyph or a leading lowercase enumerator such
    as ``a)`` at the start of a line is replaced by ``"- "`` (the enumerator
    letter itself is discarded), and the result is passed through
    normalize_whitespace().

    Args:
        text: Raw page text.

    Returns:
        Cleaned text; empty string if nothing survives the filter.
    """
    kept = []
    for raw in text.splitlines():
        line = raw.strip()
        if not line:
            continue
        if re.fullmatch(r"\d+", line):
            continue
        if len(line) == 1 and not line.isalnum():
            continue
        kept.append(line)

    text = "\n".join(kept)
    text = re.sub(r"(?m)^\s*[•▪●]\s*", "- ", text)
    # No capture group needed: the replacement never referenced it.
    text = re.sub(r"(?m)^\s*[a-z]\)\s*", "- ", text)
    return normalize_whitespace(text)
64
+
65
+
66
def ocr_page(page: fitz.Page, zoom: float = 1.5) -> str:
    """Rasterise a PDF page and run Tesseract OCR on the image.

    Args:
        page: The page to render.
        zoom: Scale factor applied on both axes when rendering.

    Returns:
        The text returned by pytesseract.image_to_string() for the rendered
        page.
    """
    pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)
    # Image.frombytes documents `size` as a (width, height) tuple.
    # NOTE(review): mode "RGB" assumes get_pixmap() yields 3-channel RGB
    # samples when no colorspace is given - confirm against PyMuPDF docs.
    img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
    return pytesseract.image_to_string(img)
70
+
71
+
72
def looks_like_noise_page(text: str) -> bool:
    """Heuristically decide whether a page carries no useful content.

    A page is treated as noise when, after whitespace normalisation, any of
    the following holds:
      * it is empty;
      * it has fewer than 8 alphabetic words;
      * it has fewer than 60 alphabetic words and matches one of
        BAD_PAGE_PATTERNS (case-insensitive);
      * it is shorter than 120 characters, unchanged by upper-casing, and has
        fewer than 15 alphabetic words.

    Args:
        text: Page text.

    Returns:
        True if the page should be discarded, False otherwise.
    """
    normalized = normalize_whitespace(text)
    if not normalized:
        return True

    # Work on a single-line view so patterns are not broken up by line wraps.
    compact = normalized.replace("\n", " ")
    word_count = len(re.findall(r"[A-Za-z][A-Za-z&'/-]+", compact))

    # Real content pages normally have more than a handful of alphabetic words.
    if word_count < 8:
        return True

    # Low-information cover / contents / banner pages. The word-count guard
    # does not depend on the pattern, so it is checked once up front.
    if word_count < 60 and any(
        re.search(pattern, compact, flags=re.IGNORECASE)
        for pattern in BAD_PAGE_PATTERNS
    ):
        return True

    # Very short all-caps dividers.
    if len(compact) < 120 and compact.upper() == compact and word_count < 15:
        return True

    return False
95
+
96
+
97
def extract_page_text(doc: fitz.Document, page_number_1_based: int) -> tuple[str, str]:
    """Extract cleaned text for one page, falling back to OCR when needed.

    The embedded text layer is tried first. OCR is attempted when that text is
    shorter than 120 characters or is classified as noise; the OCR result is
    used only if it is longer than the embedded text. If the final text is
    still classified as noise, it is discarded.

    Args:
        doc: Open PDF document.
        page_number_1_based: Page number counted from 1.

    Returns:
        ``(text, source)`` where ``source`` is ``"native"`` or ``"ocr"``, or
        ``("", "<source>_filtered")`` when the page was rejected as noise.

    Raises:
        IndexError: If ``page_number_1_based`` is smaller than 1. Without this
            guard the value would turn into a negative index and silently
            address a page counted from the end of the document.
    """
    if page_number_1_based < 1:
        raise IndexError(
            f"page number must be >= 1, got {page_number_1_based}"
        )

    page = doc[page_number_1_based - 1]

    text = clean_page_text(page.get_text("text"))
    source = "native"
    is_noise = looks_like_noise_page(text)

    if len(text) < 120 or is_noise:
        ocr_text = clean_page_text(ocr_page(page))
        if len(ocr_text) > len(text):
            text = ocr_text
            source = "ocr"
            # Text changed, so the noise verdict has to be recomputed.
            is_noise = looks_like_noise_page(text)

    if is_noise:
        return "", f"{source}_filtered"

    return text, source
114
+
115
+
116
def progress_bar(current: int, total: int, width: int = 28) -> str:
    """Render a textual progress bar such as ``[###-----] 3/8 ( 37.5%)``.

    Args:
        current: Number of completed steps.
        total: Total number of steps.
        width: Number of characters between the brackets.

    Returns:
        ``"[no-progress]"`` when ``total`` is zero or negative; otherwise the
        bar followed by ``current/total`` and the percentage. The counter and
        percentage always show the real values, but the drawn bar is clamped
        so it never becomes wider or narrower than ``width`` characters when
        ``current`` lies outside ``0..total``.
    """
    if total <= 0:
        return "[no-progress]"
    # Clamp so an out-of-range `current` cannot distort the bar width.
    filled = max(0, min(width, int(width * current / total)))
    bar = "#" * filled + "-" * (width - filled)
    pct = (current / total) * 100
    return f"[{bar}] {current}/{total} ({pct:5.1f}%)"
123
+
124
+
125
def _block_header(block: dict, start: int, end: int) -> str:
    """Build the markdown heading and metadata bullet list for a block."""
    return (
        f"## {block['section']} :: {block['subsection']}\n\n"
        f"- scope_label: {block['scope_label']}\n"
        f"- source_doc: {block['source_doc']}\n"
        f"- pages: {start}-{end}\n"
    )


def _block_meta(block: dict, start: int, end: int, page_stats: list, total_chars: int, seconds: float) -> dict:
    """Build the report row describing how a block was extracted."""
    return {
        "section": block["section"],
        "subsection": block["subsection"],
        "scope_label": block["scope_label"],
        "source_doc": block["source_doc"],
        "pages": [start, end],
        "page_stats": page_stats,
        "total_chars": total_chars,
        "seconds": seconds,
    }


def block_to_markdown(doc: fitz.Document, block: dict, block_index: int, total_blocks: int, pdf_label: str) -> tuple[str, dict]:
    """Convert one handbook block (a page range) into a markdown section.

    If ``block`` has a truthy ``"manual_text"`` entry, that text is used
    verbatim (after whitespace normalisation) and no page is read from the
    PDF. Otherwise every page in the inclusive range ``block["pages"]`` is
    extracted with extract_page_text() and emitted under a ``### Page N``
    heading; pages that come back empty are skipped in the markdown but still
    recorded in the page statistics.

    Progress is printed to stdout as a side effect.

    Args:
        doc: Open PDF document the block refers to.
        block: Mapping with the keys ``pages`` (``[start, end]``, 1-based),
            ``section``, ``subsection``, ``scope_label``, ``source_doc`` and
            optionally ``manual_text``.
        block_index: 1-based position of the block, used for logging only.
        total_blocks: Total number of blocks, used for logging only.
        pdf_label: Label of the PDF shown in the log output.

    Returns:
        ``(markdown, meta)``. ``meta["total_chars"]`` is the length of the
        assembled body, i.e. it includes the ``### Page N`` headings and the
        blank lines between pages, so it is larger than the sum of the
        per-page ``chars`` values. In the manual-override case the single
        ``page_stats`` entry stores ``page`` as the string ``"start-end"``
        rather than an integer.
    """
    start, end = block["pages"]
    title = f"{block['section']} :: {block['subsection']}"
    header = _block_header(block, start, end)

    if block.get("manual_text"):
        print()
        print("=" * 90)
        print(f"[BLOCK {block_index}/{total_blocks}] {pdf_label} | {title} | pages {start}-{end} | MANUAL OVERRIDE")
        print("=" * 90)
        body = normalize_whitespace(block["manual_text"])
        manual_stats = [
            {
                "page": f"{start}-{end}",
                "source": "manual_visual_override",
                "chars": len(body),
                "seconds": 0.0,
            }
        ]
        meta = _block_meta(block, start, end, manual_stats, len(body), 0.0)
        print(f"[DONE BLOCK] {title} | MANUAL OVERRIDE")
        return (header + ("\n" + body + "\n" if body else "\n"), meta)

    pieces: List[str] = []
    page_stats = []
    block_start_time = time.time()
    total_pages = end - start + 1

    print()
    print("=" * 90)
    print(f"[BLOCK {block_index}/{total_blocks}] {pdf_label} | {title} | pages {start}-{end}")
    print("=" * 90)

    for i, p in enumerate(range(start, end + 1), start=1):
        page_start_time = time.time()
        # No newline here: the result for this page is appended to the same line.
        print(f" {progress_bar(i, total_pages)} -> extracting page {p} ... ", end="", flush=True)

        text, source = extract_page_text(doc, p)
        elapsed = time.time() - page_start_time

        if text:
            pieces.append(f"### Page {p}\n{text}")

        page_stats.append(
            {
                "page": p,
                "source": source,
                "chars": len(text),
                "seconds": round(elapsed, 2),
            }
        )

        print(f"{source.upper():12s} | chars={len(text):5d} | {elapsed:6.2f}s", flush=True)

    block_elapsed = time.time() - block_start_time
    body = "\n\n".join(pieces).strip()
    meta = _block_meta(block, start, end, page_stats, len(body), round(block_elapsed, 2))

    print(f"[DONE BLOCK] {title} | {block_elapsed:.2f}s")
    return (header + ("\n" + body + "\n" if body else "\n"), meta)
207
+
208
def write_markdown(pdf_path: Path, blocks: list[dict], out_path: Path, title: str) -> list[dict]:
    """Render all blocks of one PDF into a single structured markdown file.

    Opens ``pdf_path``, converts each entry of ``blocks`` with
    block_to_markdown(), and writes the concatenated sections (preceded by a
    ``# title`` heading) to ``out_path`` as UTF-8. Progress is printed to
    stdout.

    Args:
        pdf_path: PDF to read.
        blocks: Block descriptors, processed in the given order.
        out_path: Destination markdown file; overwritten if it exists.
        title: Top-level heading of the generated document.

    Returns:
        One report row (the ``meta`` dict from block_to_markdown) per block,
        in the same order as ``blocks``.
    """
    print()
    print("#" * 100)
    print(f"[START] Building markdown for: {title}")
    print(f"[PDF] {pdf_path}")
    print(f"[OUT] {out_path}")
    print(f"[BLOCKS] {len(blocks)}")
    print("#" * 100)

    sections = [f"# {title}", ""]
    report_rows = []
    block_count = len(blocks)

    start_time = time.time()
    with fitz.open(pdf_path) as doc:
        for idx, block in enumerate(blocks, start=1):
            md, meta = block_to_markdown(
                doc=doc,
                block=block,
                block_index=idx,
                total_blocks=block_count,
                pdf_label=pdf_path.name,
            )
            sections.append(md)
            report_rows.append(meta)

    # Do not rely on the caller having created the target directory.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text("\n\n".join(sections).strip() + "\n", encoding="utf-8")
    elapsed = time.time() - start_time
    print(f"[DONE FILE] {title} -> {out_path} | {elapsed:.2f}s")
    return report_rows
237
+
238
+
239
def main() -> None:
    """Build both structured markdown files and the JSON extraction report.

    Runs write_markdown() for the general and the complete handbook, then
    writes a report to REPORT_PATH containing the output paths, the number of
    blocks per handbook, the per-block report rows and the total run time in
    seconds. Progress and a final summary are printed to stdout.
    """
    total_start = time.time()

    print("[INFO] Markdown preprocess started")
    print(f"[INFO] General PDF : {GENERAL_PDF}")
    print(f"[INFO] Complete PDF: {COMPLETE_PDF}")
    print(f"[INFO] General MD : {GENERAL_MD}")
    print(f"[INFO] Complete MD : {COMPLETE_MD}")
    print(f"[INFO] Report Path : {REPORT_PATH}")

    general_report = write_markdown(
        GENERAL_PDF,
        GENERAL_BLOCKS,
        GENERAL_MD,
        "General Handbook (Structured Markdown)",
    )
    complete_report = write_markdown(
        COMPLETE_PDF,
        COMPLETE_BLOCKS,
        COMPLETE_MD,
        "Complete Handbook (Structured Markdown)",
    )

    report = {
        "generated_files": {
            "general_markdown": str(GENERAL_MD),
            "complete_markdown": str(COMPLETE_MD),
        },
        "general_block_count": len(general_report),
        "complete_block_count": len(complete_report),
        "general_blocks": general_report,
        "complete_blocks": complete_report,
        "total_seconds": round(time.time() - total_start, 2),
    }

    # ensure_ascii=False keeps non-ASCII characters readable in the report.
    REPORT_PATH.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8")

    print()
    print("=" * 100)
    print(f"Wrote: {GENERAL_MD}")
    print(f"Wrote: {COMPLETE_MD}")
    print(f"Wrote: {REPORT_PATH}")
    print(f"[ALL DONE] Total time: {time.time() - total_start:.2f}s")
    print("=" * 100)


if __name__ == "__main__":
    main()
UM_Handbook/UM_SFT_QA_Dataset_Builder_from_Index.py ADDED
@@ -0,0 +1,620 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from __future__ import annotations
3
+
4
+ from pathlib import Path
5
+ import json
6
+ import re
7
+ from typing import Any
8
+
9
+ from um_handbook_config import DATA_ROOT, INDEX_DIR, CHUNKS_DIR, SFT_DIR
10
+
11
+ PROJECT_DIR = Path(__file__).resolve().parent
12
+
13
# Create every directory this script reads from or writes to.
# parents=True keeps a fresh checkout working even when a configured directory
# sits below folders that do not exist yet; exist_ok=True makes re-runs harmless.
DATA_ROOT.mkdir(parents=True, exist_ok=True)
INDEX_DIR.mkdir(parents=True, exist_ok=True)
CHUNKS_DIR.mkdir(parents=True, exist_ok=True)
SFT_DIR.mkdir(parents=True, exist_ok=True)
17
+
18
+ INDEX_PATH = INDEX_DIR / "UM_Manual_Index.json"
19
+ CHUNK_PATH = CHUNKS_DIR / "Source_Chunks_Dataset.jsonl"
20
+
21
+ METADATA_PATH = SFT_DIR / "SFT_QA_Metadata_Draft.jsonl"
22
+ METADATA_PRETTY_PATH = SFT_DIR / "SFT_QA_Metadata_Draft_pretty.json"
23
+ TRAINING_READY_PATH = SFT_DIR / "SFT_QA_Training_Draft.jsonl"
24
+ TRAINING_READY_PRETTY_PATH = SFT_DIR / "SFT_QA_Training_Draft_pretty.json"
25
+ REPORT_PATH = SFT_DIR / "SFT_QA_Draft_Build_Report.json"
26
+
27
+ MANUAL_QA_OVERRIDES = {
28
+ "UMI-0112": "The UM student dress code poster says that all Universiti Malaya students must follow the Universiti Malaya Administrative Directions (Student Dress Code and Appearance) 2024 while on campus. It illustrates three main attire contexts: formal or traditional formal attire for official events, neat and presentable campus attire for lectures, office matters, examinations, and library use, and sportswear for sports and recreational activities. The poster also states that non-compliance may lead to reprimand or other administrative action.",
29
+ "UMI-0379": "The poster emphasizes that students must comply with the Universiti Malaya Administrative Directions on dress code and appearance while on campus. The overall message is to dress in a neat, presentable, and context-appropriate way, with different attire illustrated for official events, normal academic or administrative settings, and sports or recreational activities.",
30
+ "UMI-0380": "The poster states that academic, administrative, library, and security staff are authorised to reprimand students verbally or in writing if they violate the dress code directions. It also says that a student who does not comply may be prevented from entering or dealing in areas where the provisions apply, and other administrative actions may be taken from time to time.",
31
+ "UMI-0381": "For official events, the poster illustrates formal attire, including suit-style clothing and traditional formal wear, to convey a neat and official appearance appropriate for formal university occasions.",
32
+ "UMI-0382": "For lectures, office matters, examinations, and library use, the poster illustrates neat and presentable campus attire rather than ceremonial or sports clothing. The examples shown are everyday academic or administrative outfits suitable for being on campus in those settings.",
33
+ "UMI-0383": "The poster distinguishes attire by activity. Sportswear is illustrated for sports and recreational activities, while official events use formal or traditional formal clothing, and lectures, office matters, examinations, and library use are shown with neat everyday campus attire. In other words, students are expected to dress according to the setting or activity."
34
+ }
35
+
36
+ BAD_ANSWER_PATTERNS = [
37
+ r"^###\s*Page\s+\d+",
38
+ r"\bmore info\b",
39
+ r"fsktm[_\.]?um",
40
+ r"POSTGRADUATE\s+PROGRAMME\s+HANDBOOK",
41
+ r"UNDERGRADUATE\s+PROGRAMME\s+HANDBOOK",
42
+ r"^C\s*O\s*N\s*T\s*E\s*N\s*T\s*S$",
43
+ ]
44
+
45
+
46
def normalize_text(text: str) -> str:
    """Clean up text extracted from the handbook PDFs.

    Args:
        text: Raw text; ``None`` is tolerated and any non-string is passed
            through ``str()``.

    Returns:
        The text with non-breaking spaces turned into spaces, soft hyphens
        removed, the "fi"/"fl" ligature characters expanded, runs of spaces or
        tabs collapsed to one space, three or more newlines collapsed to two,
        and surrounding whitespace stripped. ``None`` yields ``""``.
    """
    if text is None:
        return ""
    text = str(text).replace("\u00a0", " ").replace("\xad", "")
    # Expand the single-codepoint ligatures (U+FB01, U+FB02) that PDF extraction
    # emits. They are written as escapes so the replacement cannot silently
    # degrade into a no-op if this file is ever Unicode-normalised.
    text = text.replace("\ufb01", "fi").replace("\ufb02", "fl")
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()
54
+
55
+
56
def normalize_for_compare(text: str) -> str:
    """Reduce text to lowercase ASCII letters/digits separated by single spaces.

    Used wherever two strings are compared loosely: punctuation and any other
    character outside ``a-z0-9`` becomes a space before whitespace is collapsed.
    """
    lowered = normalize_text(text).lower()
    alnum_only = re.sub(r"[^a-z0-9\s]+", " ", lowered)
    return re.sub(r"\s+", " ", alnum_only).strip()
61
+
62
+
63
def tokenize(text: str) -> set[str]:
    """Return the distinct comparison tokens of ``text`` that have at least two characters."""
    words = normalize_for_compare(text).split()
    return {word for word in words if len(word) >= 2}
65
+
66
+
67
def safe_slug(text: str) -> str:
    """Build an underscore-separated slug (at most 80 characters) from ``text``.

    Returns the literal ``"item"`` when nothing usable remains after normalisation.
    """
    slug = re.sub(r"\s+", "_", normalize_for_compare(text))
    if not slug:
        return "item"
    return slug[:80]
71
+
72
+
73
def truncate_text(text: str, max_chars: int = 1200) -> str:
    """Shorten normalised ``text`` to roughly ``max_chars`` characters.

    Text that already fits is returned unchanged (after normalisation).
    Otherwise the text is cut at ``max_chars`` and, when the last ``". "`` or
    newline inside the cut lies beyond index 200, trimmed back to that break;
    if no such break exists the hard cut is returned with ``" ..."`` appended.
    """
    normalized = normalize_text(text)
    if len(normalized) <= max_chars:
        return normalized

    head = normalized[:max_chars].rstrip()
    sentence_end = head.rfind(". ")
    line_end = head.rfind("\n")
    boundary = sentence_end if sentence_end > line_end else line_end

    if boundary > 200:
        # Keep the break character itself (the full stop or the newline, which
        # the final strip removes again).
        return head[: boundary + 1].strip()
    return head.strip() + " ..."
82
+
83
+
84
def load_json(path: Path) -> Any:
    """Parse the UTF-8 JSON document stored at ``path`` and return the decoded value."""
    with path.open("r", encoding="utf-8") as handle:
        return json.load(handle)
86
+
87
+
88
def load_jsonl(path: Path) -> list[dict]:
    """Read a UTF-8 JSON Lines file into a list, one decoded value per non-blank line.

    A missing file is not an error: an empty list is returned instead.
    """
    if not path.exists():
        return []
    with path.open("r", encoding="utf-8") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        return [json.loads(entry) for entry in stripped_lines if entry]
98
+
99
+
100
def write_jsonl(path: Path, rows: list[dict]) -> None:
    """Write ``rows`` to ``path`` as UTF-8 JSON Lines (one object per line, non-ASCII kept as-is)."""
    with path.open("w", encoding="utf-8") as handle:
        handle.writelines(json.dumps(record, ensure_ascii=False) + "\n" for record in rows)
104
+
105
+
106
def write_pretty_json(path: Path, rows: list[dict]) -> None:
    """Write ``rows`` to ``path`` as one UTF-8 JSON array indented by two spaces."""
    with path.open("w", encoding="utf-8") as handle:
        json.dump(rows, handle, ensure_ascii=False, indent=2)
108
+
109
+
110
def first_non_empty(d: dict, keys: list[str], default: Any = "") -> Any:
    """Return the value of the first key in ``keys`` that holds something non-empty.

    ``None``, ``""``, ``[]`` and ``{}`` count as empty; other falsy values such
    as ``0`` are returned. ``default`` is returned when no key qualifies.
    """
    empties = (None, "", [], {})
    populated = (d[name] for name in keys if name in d and d[name] not in empties)
    return next(populated, default)
115
+
116
+
117
def ensure_list(value: Any) -> list:
    """Coerce ``value`` to a list: ``None`` -> ``[]``, a list is returned as-is, anything else is wrapped."""
    if isinstance(value, list):
        return value
    return [] if value is None else [value]
123
+
124
+
125
def extract_index_items(raw: Any) -> list[dict]:
    """Locate the list of index entries inside a decoded index JSON document.

    A top-level list is returned unchanged. For a top-level object the keys
    ``entries``, ``items``, ``index``, ``records``, ``data`` and ``rows`` are
    tried in that order, and the first one holding a list made only of dicts
    wins.

    Raises:
        ValueError: if no such list can be found.
    """
    if isinstance(raw, list):
        return raw

    if isinstance(raw, dict):
        container_keys = ("entries", "items", "index", "records", "data", "rows")
        for candidate in (raw.get(name) for name in container_keys):
            if not isinstance(candidate, list):
                continue
            if all(isinstance(item, dict) for item in candidate):
                return candidate

    raise ValueError(f"Unsupported index JSON structure in {INDEX_PATH}")
134
+
135
+
136
def index_id(row: dict, idx: int) -> str:
    """Return the entry's identifier as a string, or ``idx_NNNNN`` built from ``idx`` when it has none."""
    positional_fallback = f"idx_{idx:05d}"
    found = first_non_empty(row, ["index_id", "id", "question_id"], positional_fallback)
    return str(found)
138
+
139
+
140
def index_question(row: dict) -> str:
    """Return the entry's question text (first populated question-like field), stripped."""
    question_fields = ["canonical_question", "question", "core_question", "query"]
    found = first_non_empty(row, question_fields, "")
    return str(found).strip()
142
+
143
+
144
def index_scope(row: dict) -> str:
    """Return the entry's scope label, stripped and lower-cased (``""`` when absent)."""
    found = first_non_empty(row, ["scope_label", "scope", "label"], "")
    return str(found).strip().lower()
146
+
147
+
148
def index_section(row: dict) -> str:
    """Return the entry's ``section`` value as a stripped string (``""`` when absent)."""
    found = first_non_empty(row, ["section"], "")
    return str(found).strip()
150
+
151
+
152
def index_subsection(row: dict) -> str:
    """Return the entry's subsection (``subsection`` or ``sub_section``) as a stripped string."""
    found = first_non_empty(row, ["subsection", "sub_section"], "")
    return str(found).strip()
154
+
155
+
156
def index_source_docs(row: dict) -> list[str]:
    """Return the entry's source document names as stripped, non-empty strings.

    A single scalar value is treated as a one-element list.
    """
    raw_docs = first_non_empty(row, ["source_docs", "source_doc", "source_documents"], [])
    stripped = (str(item).strip() for item in ensure_list(raw_docs))
    return [name for name in stripped if name]
159
+
160
+
161
def index_keywords(row: dict) -> list[str]:
    """Return the entry's keywords (``keywords`` or ``tags``) as stripped, non-empty strings."""
    raw_keywords = first_non_empty(row, ["keywords", "tags"], [])
    stripped = (str(item).strip() for item in ensure_list(raw_keywords))
    return [word for word in stripped if word]
164
+
165
+
166
def chunk_text(chunk: dict) -> str:
    """Return the chunk's normalised body text, taken from the first populated text-like field."""
    text_fields = ["text", "chunk_text", "content", "body", "markdown_text"]
    raw_body = first_non_empty(chunk, text_fields, "")
    return normalize_text(raw_body)
168
+
169
+
170
def chunk_id(chunk: dict, idx: int) -> str:
    """Return the chunk's identifier as a string, or ``chunk_NNNNNN`` built from ``idx`` when it has none."""
    found = first_non_empty(chunk, ["chunk_id", "id"], "")
    if not found:
        return f"chunk_{idx:06d}"
    return str(found)
173
+
174
+
175
def chunk_scope(chunk: dict) -> str:
    """Return the chunk's scope label, stripped and lower-cased (``""`` when absent)."""
    found = first_non_empty(chunk, ["scope_label", "scope", "label"], "")
    return str(found).strip().lower()
177
+
178
+
179
def chunk_source_doc(chunk: dict) -> str:
    """Return the name of the document the chunk came from, stripped (``""`` when absent)."""
    found = first_non_empty(chunk, ["source_doc", "source_document", "doc_name"], "")
    return str(found).strip()
181
+
182
+
183
def chunk_section(chunk: dict) -> str:
    """Return the chunk's ``section`` value as a stripped string (``""`` when absent)."""
    found = first_non_empty(chunk, ["section"], "")
    return str(found).strip()
185
+
186
+
187
def chunk_subsection(chunk: dict) -> str:
    """Return the chunk's subsection (``subsection`` or ``sub_section``) as a stripped string."""
    found = first_non_empty(chunk, ["subsection", "sub_section"], "")
    return str(found).strip()
189
+
190
+
191
def chunk_pages(chunk: dict) -> list[int]:
    """Return the chunk's page references as a list.

    The value is read from ``pages``, ``source_pages`` or ``page_range``. Lists
    pass through, tuples are converted, and a lone scalar is wrapped.
    """
    found = first_non_empty(chunk, ["pages", "source_pages", "page_range"], [])
    if isinstance(found, tuple):
        return list(found)
    if isinstance(found, list):
        return found
    return ensure_list(found)
198
+
199
+
200
def overlap_score(a: set[str], b: set[str]) -> int:
    """Count how many tokens the two sets have in common."""
    shared = a.intersection(b)
    return len(shared)
202
+
203
+
204
def looks_like_bad_answer(text: str) -> bool:
    """Decide whether ``text`` is unusable as an answer.

    Text is rejected when it is empty after normalisation, when it matches any
    of ``BAD_ANSWER_PATTERNS`` (case-insensitively, with newlines flattened to
    spaces), or when it is too short to carry content.
    """
    cleaned = normalize_text(text)
    if not cleaned:
        return True

    single_line = cleaned.replace("\n", " ")
    if any(re.search(pattern, single_line, flags=re.IGNORECASE) for pattern in BAD_ANSWER_PATTERNS):
        return True

    # The length floor is deliberately low so that short but valid statements
    # (a one-line vision or mission, for instance) are still accepted.
    word_count = len(re.findall(r"[A-Za-z][A-Za-z&'/-]+", single_line))
    return word_count < 3 or len(single_line) < 15
223
+
224
+
225
def score_chunk_for_index(index_row: dict, chunk: dict) -> tuple[int, dict]:
    """Score how well ``chunk`` fits the index entry ``index_row``.

    Returns:
        ``(score, reasons)``. ``reasons`` records each signal that fired:
        an exact ``linked_index_ids`` hit (+100); equal scope, section and
        subsection (+30 each); a matching source document (+20); and token
        overlaps weighted 5 (keywords vs. text), 3 (question vs. text) and 8
        (question/keywords vs. the chunk's headings).
    """
    total = 0
    why: dict[str, Any] = {}

    question_tokens = tokenize(index_question(index_row))
    keyword_tokens: set[str] = set()
    for keyword in index_keywords(index_row):
        keyword_tokens.update(tokenize(keyword))

    body_tokens = tokenize(chunk_text(chunk))
    chunk_sec = chunk_section(chunk)
    chunk_subsec = chunk_subsection(chunk)

    # Exact link recorded on the chunk for this entry.
    if index_row.get("index_id") in set(chunk.get("linked_index_ids", [])):
        total += 100
        why["linked_index_match"] = True

    # Metadata equality checks; each needs both sides to be populated.
    row_scope = index_scope(index_row)
    its_scope = chunk_scope(chunk)
    if row_scope and its_scope and row_scope == its_scope:
        total += 30
        why["scope_match"] = True

    row_sec = index_section(index_row)
    if row_sec and chunk_sec and normalize_for_compare(row_sec) == normalize_for_compare(chunk_sec):
        total += 30
        why["section_match"] = True

    row_subsec = index_subsection(index_row)
    if row_subsec and chunk_subsec and normalize_for_compare(row_subsec) == normalize_for_compare(chunk_subsec):
        total += 30
        why["subsection_match"] = True

    chunk_doc_norm = normalize_for_compare(chunk_source_doc(chunk))
    if any(normalize_for_compare(doc) == chunk_doc_norm for doc in index_source_docs(index_row)):
        total += 20
        why["source_doc_match"] = True

    # Lexical overlap signals.
    keyword_hits = overlap_score(keyword_tokens, body_tokens)
    if keyword_hits:
        total += keyword_hits * 5
        why["keyword_overlap"] = keyword_hits

    question_hits = overlap_score(question_tokens, body_tokens)
    if question_hits:
        total += question_hits * 3
        why["question_overlap"] = question_hits

    heading_hits = overlap_score(question_tokens | keyword_tokens, tokenize(f"{chunk_sec} {chunk_subsec}"))
    if heading_hits:
        total += heading_hits * 8
        why["heading_overlap"] = heading_hits

    return total, why
288
+
289
+
290
def choose_best_chunk(index_row: dict, chunks: list[dict]) -> tuple[dict | None, dict]:
    """Pick the chunk that best supports ``index_row``, or ``None`` when nothing scores high enough.

    Chunks whose ``linked_index_ids`` contain the entry's ``index_id`` are
    strongly preferred: when any exist, only they are scored. The winner (the
    earliest chunk among equal top scores) must reach 120 points in that case
    and 160 points otherwise.

    Returns:
        ``(chunk_or_None, {"best_score": ..., "match_reasons": ...})``;
        ``best_score`` is ``-1`` when there was nothing to score.
    """
    target_id = index_row.get("index_id")
    linked = [c for c in chunks if target_id in set(c.get("linked_index_ids", []))]
    pool = linked or chunks

    if not pool:
        return None, {"best_score": -1, "match_reasons": {}}

    scored = [(score_chunk_for_index(index_row, candidate), candidate) for candidate in pool]
    # max() keeps the first of several equal maxima, i.e. the earliest chunk.
    (top_score, top_reasons), top_chunk = max(scored, key=lambda item: item[0][0])
    outcome = {"best_score": top_score, "match_reasons": top_reasons}

    # Linked candidates were already mapped by section/subsection in the chunk
    # builder, so they face a lower bar than a purely lexical match.
    threshold = 120 if linked else 160
    if top_score < threshold:
        return None, outcome
    return top_chunk, outcome
318
+
319
+
320
+
321
+
322
+
323
def extract_identity_line(lines: list[str], question_norm: str) -> str:
    """Find a vision or mission statement on a page that lost its explicit labels.

    After PDF extraction some faculty identity pages no longer carry usable
    VISION / MISSION labels: the statement lines may precede the labels, or the
    labels may be letter-spaced ("v i s i o n"). This helper works from the
    shape of the lines instead.

    Args:
        lines: Text lines of the chunk.
        question_norm: Normalised question padded with a space on both sides.

    Returns:
        For a vision question, the first line starting with "a " that mentions
        "faculty"; for a mission question, the first line starting with "to ".
        Either may be extended by a short continuation line and is capped at
        300 characters. ``""`` when nothing matches.
    """
    # NOTE(review): this character class drops ANY line written only with these
    # letters and spaces, not just letter-spaced labels - e.g. "and innovation"
    # would be discarded too. Confirm that is acceptable.
    label_only = re.compile(r"[vmiohsnaetcrpbjdu ]+")

    kept: list[str] = []
    for raw in lines:
        candidate = normalize_text(raw)
        if not candidate:
            continue
        is_page_number = re.fullmatch(r"\d+", candidate) is not None
        is_label = label_only.fullmatch(candidate.lower()) is not None
        if is_page_number or is_label:
            continue
        kept.append(candidate)

    def with_continuation(pos: int) -> str:
        # Glue on the next line when the current one visibly breaks off
        # mid-phrase and the next line is short and not a new "to ..." statement.
        current = kept[pos]
        if pos + 1 >= len(kept):
            return current
        follower = kept[pos + 1]
        short_follower = len(follower.split()) <= 8 and not follower.lower().startswith("to ")
        if short_follower and current.endswith(("through", "and", "for", "to", "of")):
            return f"{current} {follower}".strip()
        return current

    if " vision " in question_norm:
        for pos, candidate in enumerate(kept):
            lowered = candidate.lower()
            if lowered.startswith("a ") and "faculty" in lowered:
                return truncate_text(with_continuation(pos), 300)

    if " mission " in question_norm:
        for pos, candidate in enumerate(kept):
            if candidate.lower().startswith("to "):
                return truncate_text(with_continuation(pos), 300)

    return ""
360
+
361
+
362
def extract_labeled_answer(text: str, label: str, stop_labels: list[str]) -> str:
    """Return the text that follows ``label`` up to the next stop label or the end.

    Matching is case-insensitive and spans newlines. The captured text is
    flattened to a single line and capped at 600 characters; ``""`` is returned
    when ``label`` does not occur.
    """
    stop_alternatives = "|".join(map(re.escape, stop_labels))
    regex = rf"\b{re.escape(label)}\b\s*(.+?)(?:\b(?:{stop_alternatives})\b|$)"
    found = re.search(regex, text, flags=re.IGNORECASE | re.DOTALL)
    if found is None:
        return ""

    flattened = normalize_text(found.group(1)).replace("\n", " ")
    flattened = re.sub(r"\s{2,}", " ", flattened).strip()
    return truncate_text(flattened, 600)
371
+
372
+
373
def score_segment(segment: str, question: str, keywords: list[str]) -> int:
    """Score a candidate text segment: 3 points per shared question token, 5 per shared keyword token."""
    segment_tokens = tokenize(segment)

    keyword_tokens: set[str] = set()
    for keyword in keywords:
        keyword_tokens.update(tokenize(keyword))

    question_points = overlap_score(segment_tokens, tokenize(question)) * 3
    keyword_points = overlap_score(segment_tokens, keyword_tokens) * 5
    return question_points + keyword_points
381
+
382
+
383
def extract_answer_from_chunk(index_row: dict, chunk: dict) -> str:
    """Derive an answer for ``index_row`` from the text of ``chunk``.

    Strategy, in order:

    1. Vision / mission / objectives questions first try label-aware
       extraction; vision and mission additionally fall back to the
       label-free identity-line heuristic.
    2. Otherwise every window of 1-3 consecutive lines is scored against the
       question and keywords, and up to three of the best windows are joined
       (stopping early once the joined text exceeds 450 characters). A window
       is skipped when it shares a line with one already chosen, so the
       answer never repeats the same source line.
    3. If no window scores above zero, the truncated chunk text is used.

    Returns:
        The answer text, or ``""`` when every candidate is rejected by
        ``looks_like_bad_answer``.
    """
    text = chunk_text(chunk)
    q = index_question(index_row)
    q_norm = " " + normalize_for_compare(q) + " "
    kws = index_keywords(index_row)
    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]

    def usable(candidate: str) -> bool:
        return bool(candidate) and not looks_like_bad_answer(candidate)

    # Explicit handling for faculty identity pages.
    if " vision " in q_norm:
        ans = extract_labeled_answer(text, "VISION", ["MISSION", "OBJECTIVES", "HISTORY", "STAFF"])
        if usable(ans):
            return ans
        ans = extract_identity_line(lines, q_norm)
        if usable(ans):
            return ans

    if " mission " in q_norm:
        ans = extract_labeled_answer(text, "MISSION", ["OBJECTIVES", "HISTORY", "STAFF"])
        if usable(ans):
            return ans
        ans = extract_identity_line(lines, q_norm)
        if usable(ans):
            return ans

    if " objective " in q_norm or " objectives " in q_norm:
        ans = extract_labeled_answer(text, "OBJECTIVES", ["HISTORY", "STAFF", "PROGRAMME", "ACADEMIC CALENDAR"])
        if usable(ans):
            return ans

    # Fallback: score short windows of consecutive lines. Each window remembers
    # the half-open line range [start, stop) it covers.
    windows: list[tuple[int, int, int, str]] = []
    for start in range(len(lines)):
        for span in (1, 2, 3):
            stop = min(start + span, len(lines))
            piece = normalize_text(" ".join(lines[start:stop]))
            if not piece or looks_like_bad_answer(piece):
                continue
            score = score_segment(piece, q, kws)
            if score > 0:
                windows.append((score, start, stop, piece))

    if not windows:
        cleaned = truncate_text(text, 900)
        return "" if looks_like_bad_answer(cleaned) else cleaned

    # Best score first; among equal scores prefer the shorter window.
    windows.sort(key=lambda w: (-w[0], len(w[3])))

    chosen: list[str] = []
    taken_ranges: list[tuple[int, int]] = []
    seen: set[str] = set()
    for _, start, stop, piece in windows:
        piece_key = normalize_for_compare(piece)
        if piece_key in seen:
            continue
        # Windows starting on neighbouring lines overlap heavily; taking two of
        # them would duplicate the shared lines inside the answer.
        if any(start < t_stop and t_start < stop for t_start, t_stop in taken_ranges):
            continue
        chosen.append(piece)
        seen.add(piece_key)
        taken_ranges.append((start, stop))
        if len(" ".join(chosen)) > 450 or len(chosen) >= 3:
            break

    answer = truncate_text(normalize_text(" ".join(chosen)), 700)
    return "" if looks_like_bad_answer(answer) else answer
454
+
455
+
456
+
457
+
458
def manual_override_answer(index_row: dict) -> str:
    """Return the hand-written answer registered for this entry's ``index_id``, or ``""`` if none."""
    entry_id = str(index_row.get("index_id", "")).strip()
    if entry_id in MANUAL_QA_OVERRIDES:
        return MANUAL_QA_OVERRIDES[entry_id]
    return ""
461
def build_metadata_row(index_row: dict, chosen_chunk: dict | None, match_meta: dict, qa_idx: int) -> dict:
    """Assemble the full metadata record for one index entry.

    A manual override answer always wins. Without one, the answer is extracted
    from ``chosen_chunk`` when a chunk was matched. ``review_status`` is one of
    ``manual_visual_override``, ``matched_needs_review``, ``bad_match_filtered``
    (chunk matched but no usable answer) or ``unmatched`` (no chunk, no override).
    """
    scope = index_scope(index_row)
    sec = index_section(index_row)
    subsec = index_subsection(index_row)
    src_docs = index_source_docs(index_row)
    override = manual_override_answer(index_row)

    if chosen_chunk is None:
        answer = override
        status = "manual_visual_override" if override else "unmatched"
        source_doc = src_docs[0] if src_docs else ""
        chunk_ref = ""
        pages = []
    else:
        answer = override or extract_answer_from_chunk(index_row, chosen_chunk)
        if override:
            status = "manual_visual_override"
        elif answer:
            status = "matched_needs_review"
        else:
            status = "bad_match_filtered"
        source_doc = chunk_source_doc(chosen_chunk)
        chunk_ref = chosen_chunk.get("chunk_id", "")
        pages = chunk_pages(chosen_chunk)

    return {
        "qa_id": f"qa_{qa_idx:06d}",
        "index_id": index_id(index_row, qa_idx),
        "question": index_question(index_row),
        "answer": answer,
        "scope_label": scope,
        "source_doc": source_doc,
        "section": sec,
        "subsection": subsec,
        "chunk_id": chunk_ref,
        "source_pages": pages,
        "keywords": index_keywords(index_row),
        "source_docs_from_index": src_docs,
        "retrieval_tags": [tag for tag in (scope, safe_slug(sec), safe_slug(subsec)) if tag],
        "manual_review_priority": first_non_empty(index_row, ["manual_review_priority", "priority"], "normal"),
        "review_status": status,
        "match_score": match_meta.get("best_score", -1),
        "match_reasons": match_meta.get("match_reasons", {}),
        "notes": first_non_empty(index_row, ["note", "notes"], ""),
    }
520
+
521
+
522
def build_training_row(metadata_row: dict) -> dict:
    """Project a metadata record down to the four fields kept in the training file."""
    kept_fields = ("qa_id", "index_id", "question", "answer")
    return {field: metadata_row[field] for field in kept_fields}
529
+
530
+
531
def main() -> None:
    """Build the draft SFT question/answer dataset from the manual index and the source chunks.

    Every index entry is paired with its best chunk, turned into a metadata
    row, and - unless it is unmatched or has no usable answer - also into a
    training row. Metadata, training rows (JSONL and pretty JSON each) and a
    build report are then written to disk.
    """
    print("[INFO] Loading index...")
    index_rows = extract_index_items(load_json(INDEX_PATH))
    print(f"[INFO] Loaded index items: {len(index_rows)}")

    print("[INFO] Loading chunks...")
    chunks = load_jsonl(CHUNK_PATH)
    print(f"[INFO] Loaded chunk rows: {len(chunks)}")

    # Work on copies so that every chunk is guaranteed to carry a chunk_id.
    normalized_chunks = []
    for position, source_chunk in enumerate(chunks, start=1):
        chunk_copy = dict(source_chunk)
        if not chunk_copy.get("chunk_id"):
            chunk_copy["chunk_id"] = chunk_id(chunk_copy, position)
        normalized_chunks.append(chunk_copy)

    metadata_rows = []
    training_rows = []
    matched_count = 0
    unmatched_count = 0
    filtered_bad_match_count = 0

    for position, source_entry in enumerate(index_rows, start=1):
        entry = dict(source_entry)
        if not entry.get("index_id"):
            entry["index_id"] = index_id(entry, position)

        chosen_chunk, match_meta = choose_best_chunk(entry, normalized_chunks)
        metadata_row = build_metadata_row(entry, chosen_chunk, match_meta, position)
        metadata_rows.append(metadata_row)

        status = metadata_row["review_status"]
        if status == "unmatched":
            unmatched_count += 1
        elif status == "bad_match_filtered" or not metadata_row["answer"]:
            filtered_bad_match_count += 1
        else:
            training_rows.append(build_training_row(metadata_row))
            matched_count += 1

    write_jsonl(METADATA_PATH, metadata_rows)
    write_pretty_json(METADATA_PRETTY_PATH, metadata_rows)
    write_jsonl(TRAINING_READY_PATH, training_rows)
    write_pretty_json(TRAINING_READY_PRETTY_PATH, training_rows)

    inputs = {
        "index_path": str(INDEX_PATH),
        "chunk_path": str(CHUNK_PATH),
    }
    outputs = {
        "metadata_path": str(METADATA_PATH),
        "metadata_pretty_path": str(METADATA_PRETTY_PATH),
        "training_ready_path": str(TRAINING_READY_PATH),
        "training_ready_pretty_path": str(TRAINING_READY_PRETTY_PATH),
    }
    counts = {
        "index_rows": len(index_rows),
        "chunk_rows": len(normalized_chunks),
        "metadata_rows": len(metadata_rows),
        "training_ready_rows": len(training_rows),
        "matched_rows": matched_count,
        "unmatched_rows": unmatched_count,
        "filtered_bad_match_rows": filtered_bad_match_count,
    }
    report = {
        "stage": "baseline_1",
        "format": "question_answer_only",
        "inputs": inputs,
        "outputs": outputs,
        "counts": counts,
        "notes": [
            "This build is for Baseline 1 only.",
            "Training-ready rows contain only question and answer fields.",
            "Exact linked_index_id candidates are preferred when available.",
            "Bad cover/content/heading-only answers are filtered out.",
            "Vision/Mission/Objectives questions use explicit label-aware extraction when possible.",
        ],
    }
    REPORT_PATH.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8")

    written_paths = (
        METADATA_PATH,
        METADATA_PRETTY_PATH,
        TRAINING_READY_PATH,
        TRAINING_READY_PRETTY_PATH,
        REPORT_PATH,
    )
    for written in written_paths:
        print(f"Wrote: {written}")
617
+
618
+
619
+ if __name__ == "__main__":
620
+ main()
UM_Handbook/UM_Source_Chunk_Dataset_Builder.py ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from pathlib import Path
3
+ import json
4
+ import re
5
+ from collections import defaultdict
6
+
7
+ from um_handbook_config import DATA_ROOT, MARKDOWN_DIR, INDEX_DIR, CHUNKS_DIR
8
+
9
+ PROJECT_DIR = Path(__file__).resolve().parent
10
+
11
# Create every directory this script reads from or writes to.
# parents=True keeps a fresh checkout working even when a configured directory
# sits below folders that do not exist yet; exist_ok=True makes re-runs harmless.
DATA_ROOT.mkdir(parents=True, exist_ok=True)
MARKDOWN_DIR.mkdir(parents=True, exist_ok=True)
INDEX_DIR.mkdir(parents=True, exist_ok=True)
CHUNKS_DIR.mkdir(parents=True, exist_ok=True)
15
+
16
+ GENERAL_MD = MARKDOWN_DIR / "general_handbook_structured.md"
17
+ COMPLETE_MD = MARKDOWN_DIR / "complete_handbook_structured.md"
18
+ INDEX_PATH = INDEX_DIR / "UM_Manual_Index.json"
19
+
20
+ OUT_JSONL = CHUNKS_DIR / "Source_Chunks_Dataset.jsonl"
21
+ OUT_PRETTY = CHUNKS_DIR / "Source_Chunks_Dataset_pretty.json"
22
+ OUT_REPORT = CHUNKS_DIR / "Source_Chunks_Dataset_report.json"
23
+
24
+ STOPWORDS = {
25
+ "the", "a", "an", "and", "or", "of", "to", "in", "on", "for", "with", "from", "is", "are", "was", "were", "what", "which",
26
+ "who", "how", "when", "where", "why", "this", "that", "these", "those", "their", "there", "into", "about", "under",
27
+ "through", "using", "used", "students", "student", "programme", "program", "handbook", "faculty", "computer", "science",
28
+ "information", "technology", "universiti", "malaya",
29
+ }
30
+
31
+ SPECIAL_SINGLE_CHUNK_SECTIONS = {"Student Dress Code"}
32
+
33
+ BAD_CHUNK_PATTERNS = [
34
+ r"\bmore info\b",
35
+ r"fsktm[_\.]?um",
36
+ r"POSTGRADUATE\s+PROGRAMME\s+HANDBOOK",
37
+ r"UNDERGRADUATE\s+PROGRAMME\s+HANDBOOK",
38
+ r"^C\s*O\s*N\s*T\s*E\s*N\s*T\s*S$",
39
+ ]
40
+
41
+
42
def normalize_text(text: str) -> str:
    """Clean up text taken from the structured handbook markdown.

    Non-breaking spaces become spaces, soft hyphens are removed, the "fi"/"fl"
    ligature characters are expanded, runs of spaces or tabs collapse to one
    space, three or more newlines collapse to two, and surrounding whitespace
    is stripped.
    """
    text = text.replace("\u00a0", " ").replace("\xad", "")
    # Expand the single-codepoint ligatures (U+FB01, U+FB02) that PDF extraction
    # emits. They are written as escapes so the replacement cannot silently
    # degrade into a no-op if this file is ever Unicode-normalised.
    text = text.replace("\ufb01", "fi").replace("\ufb02", "fl")
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()
48
+
49
+
50
def read_markdown_sections(path: Path) -> list[dict]:
    """Split a structured markdown file into one dict per ``## `` section.

    Each section starts with its title line, optionally followed by a run of
    ``- key: value`` metadata bullets and then the body. Titles of the form
    ``Section :: Subsection`` are split on the first ``::``. Text before the
    first ``## `` heading is skipped when it is blank or starts with ``# ``.

    Returns:
        Dicts holding ``title``, every metadata key found, ``body``,
        ``section`` and ``subsection``.
    """
    parsed = []
    document = path.read_text(encoding="utf-8")

    for part in re.split(r"(?m)^## ", document):
        if part.startswith("# ") or not part.strip():
            continue

        rows = part.splitlines()
        total = len(rows)
        heading = rows[0].strip()
        record = {"title": heading}

        def skip_blank(pos: int) -> int:
            while pos < total and not rows[pos].strip():
                pos += 1
            return pos

        cursor = skip_blank(1)
        # Metadata bullets: "- key: value", split on the first colon only.
        while cursor < total and rows[cursor].startswith("- "):
            key, _, value = rows[cursor][2:].partition(":")
            record[key.strip()] = value.strip()
            cursor += 1
        cursor = skip_blank(cursor)

        record["body"] = "\n".join(rows[cursor:]).strip()
        main_part, _, sub_part = heading.partition("::")
        record["section"] = main_part.strip()
        record["subsection"] = sub_part.strip()
        parsed.append(record)

    return parsed
76
+
77
+
78
def parse_pages(meta_pages: str) -> list[int]:
    """Turn a page specification into a list of page numbers.

    ``"A-B"`` expands to every page from A to B inclusive; any other string
    yields the integers found in it, in order. Empty input gives ``[]``.
    """
    if not meta_pages:
        return []

    as_range = re.match(r"^(\d+)-(\d+)$", meta_pages.strip())
    if as_range is None:
        return [int(number) for number in re.findall(r"\d+", meta_pages)]

    first, last = (int(part) for part in as_range.groups())
    return list(range(first, last + 1))
87
+
88
+
89
def split_body_by_pages(body: str, fallback_pages: list[int]) -> list[tuple[list[int], str]]:
    """Split a section body at its ``### Page N`` marker lines.

    Returns:
        One ``([page_number], text)`` pair per marker, where ``text`` is the
        stripped content up to the next marker (content before the first
        marker is not included). Without any marker the whole stripped body is
        returned once, paired with ``fallback_pages``.
    """
    stripped = body.strip()
    markers = list(re.finditer(r"(?m)^### Page (\d+)\s*$", stripped))
    if not markers:
        return [(fallback_pages, stripped)]

    # Each page's text ends where the next marker begins (or at the end of the body).
    page_ends = [marker.start() for marker in markers[1:]] + [len(stripped)]
    return [
        ([int(marker.group(1))], stripped[marker.end():page_end].strip())
        for marker, page_end in zip(markers, page_ends)
    ]
103
+
104
+
105
def clean_text(text: str) -> str:
    """Drop blank lines and ``### Page N`` marker lines, then normalise the remaining text."""
    stripped_lines = (raw.strip() for raw in text.splitlines())
    kept = [
        line
        for line in stripped_lines
        if line and not re.fullmatch(r"### Page \d+", line)
    ]
    joined = "\n".join(kept)
    joined = re.sub(r"\n{3,}", "\n\n", joined)
    joined = re.sub(r"[ \t]+", " ", joined)
    return normalize_text(joined)
118
+
119
+
120
def looks_like_bad_chunk(text: str) -> bool:
    """Decide whether a chunk is too low on information to keep.

    A chunk is rejected when it is empty, has fewer than 12 alphabetic words,
    matches one of ``BAD_CHUNK_PATTERNS`` while having fewer than 70 such
    words, or is shorter than 140 characters and entirely upper-case.
    """
    cleaned = normalize_text(text)
    if not cleaned:
        return True

    single_line = cleaned.replace("\n", " ")
    word_count = len(re.findall(r"[A-Za-z][A-Za-z&'/-]+", single_line))
    if word_count < 12:
        return True

    # Boilerplate markers only disqualify short chunks; a long chunk that
    # merely mentions one of them is kept.
    if word_count < 70 and any(
        re.search(pattern, single_line, flags=re.IGNORECASE) for pattern in BAD_CHUNK_PATTERNS
    ):
        return True

    return len(single_line) < 140 and single_line.upper() == single_line
140
+
141
+
142
def chunk_lines(text: str, max_chars: int = 1100) -> list[str]:
    """Pack the non-blank lines of ``text`` into chunks of at most ``max_chars`` characters.

    Lines are never split: a single line longer than ``max_chars`` becomes a
    chunk of its own. Chunks are kept moderately small so that later answer
    extraction can stay focused.
    """
    finished: list[str] = []
    pending: list[str] = []
    pending_size = 0

    for raw in text.splitlines():
        entry = raw.strip()
        if not entry:
            continue

        # A line joined onto a non-empty chunk also costs one newline separator.
        cost = len(entry) + 1 if pending else len(entry)
        if pending and pending_size + cost > max_chars:
            finished.append("\n".join(pending).strip())
            pending, pending_size = [entry], len(entry)
            continue

        pending.append(entry)
        pending_size += cost

    if pending:
        finished.append("\n".join(pending).strip())
    return finished
168
+
169
+
170
def keywords_from_text(text: str, limit: int = 12) -> list[str]:
    """Return up to ``limit`` of the most frequent words in ``text``.

    Words are lower-cased; stopwords and words shorter than three characters
    are ignored. Ordering is by descending frequency, then alphabetically.
    """
    counts = defaultdict(int)
    for word in re.findall(r"[A-Za-z][A-Za-z0-9&/\-']+", text.lower()):
        if len(word) >= 3 and word not in STOPWORDS:
            counts[word] += 1

    ranked = sorted(counts, key=lambda word: (-counts[word], word))
    return ranked[:limit]
178
+
179
+
180
def load_index_entries() -> list[dict]:
    """Load the manual index entries from ``INDEX_PATH``.

    Accepts the same layouts as the SFT dataset builder, which reads the same
    file: either a top-level list of entry objects, or an object that holds
    such a list under ``entries`` (tried first), ``items``, ``index``,
    ``records``, ``data`` or ``rows``.

    Raises:
        ValueError: if no list of entry objects can be found.
    """
    data = json.loads(INDEX_PATH.read_text(encoding="utf-8"))

    if isinstance(data, list):
        candidates = [data]
    elif isinstance(data, dict):
        container_keys = ("entries", "items", "index", "records", "data", "rows")
        candidates = [data.get(name) for name in container_keys]
    else:
        candidates = []

    for candidate in candidates:
        # Entries are indexed by key later on, so every element must be a dict.
        if isinstance(candidate, list) and all(isinstance(item, dict) for item in candidate):
            return candidate

    raise ValueError(f"Unsupported Manual_Index JSON format in {INDEX_PATH}")
185
+
186
+
187
def section_key(section: str, subsection: str) -> str:
    """Build the ``section::subsection`` lookup key from the two stripped names."""
    return "::".join((section.strip(), subsection.strip()))
189
+
190
+
191
def main() -> None:
    """Build the source chunk dataset from the two structured markdown files.

    Every markdown section is split per page, cleaned, packed into chunks
    (sections listed in ``SPECIAL_SINGLE_CHUNK_SECTIONS`` stay one chunk per
    page) and linked to the manual index entries that share its exact
    section/subsection. Rows are written as JSONL and pretty JSON, followed by
    a small report.

    Raises:
        FileNotFoundError: if either structured markdown file is missing.
    """
    if not GENERAL_MD.exists() or not COMPLETE_MD.exists():
        raise FileNotFoundError("Run UM_Handbook_Markdown_Preprocess.py first.")

    # Map "section::subsection" -> ids of the index entries filed under it.
    index_by_key = defaultdict(list)
    for entry in load_index_entries():
        # An entry may lack a subsection (or carry null); treat that as empty
        # instead of crashing on a missing key or on None.strip().
        key = section_key(str(entry.get("section") or ""), str(entry.get("subsection") or ""))
        index_by_key[key].append(entry["index_id"])

    rows = []
    markdown_sections = read_markdown_sections(GENERAL_MD) + read_markdown_sections(COMPLETE_MD)

    for sec in markdown_sections:
        fallback_pages = parse_pages(sec.get("pages", ""))
        linked = index_by_key.get(section_key(sec["section"], sec["subsection"]), [])
        page_parts = split_body_by_pages(sec["body"], fallback_pages)
        keep_whole_page = sec.get("section") in SPECIAL_SINGLE_CHUNK_SECTIONS

        chunk_index = 0
        for page_list, page_text in page_parts:
            page_text = clean_text(page_text)
            if looks_like_bad_chunk(page_text):
                continue

            candidate_chunks = [page_text] if keep_whole_page else chunk_lines(page_text)
            for chunk in candidate_chunks:
                chunk = clean_text(chunk)
                if looks_like_bad_chunk(chunk):
                    continue

                chunk_index += 1
                rows.append(
                    {
                        "chunk_id": f"SC-{len(rows)+1:05d}",
                        "source_doc": sec.get("source_doc"),
                        "scope_label": sec.get("scope_label"),
                        "section": sec.get("section"),
                        "subsection": sec.get("subsection"),
                        "pages": page_list or fallback_pages,
                        "chunk_index": chunk_index,
                        "text": chunk,
                        "keywords": keywords_from_text(chunk),
                        "linked_index_ids": linked,
                    }
                )

    with OUT_JSONL.open("w", encoding="utf-8") as f:
        for row in rows:
            f.write(json.dumps(row, ensure_ascii=False) + "\n")

    OUT_PRETTY.write_text(json.dumps(rows, ensure_ascii=False, indent=2), encoding="utf-8")

    # Count chunks per scope label in a single pass. A section without a
    # scope_label bullet yields None, and None cannot be ordered against str,
    # so the sort key places None last rather than letting sorted() raise.
    scope_counts = defaultdict(int)
    for row in rows:
        scope_counts[row["scope_label"]] += 1
    scope_distribution = {
        label: scope_counts[label]
        for label in sorted(scope_counts, key=lambda value: (value is None, str(value)))
    }

    report = {
        "total_chunks": len(rows),
        "scope_distribution": scope_distribution,
        "notes": [
            "Chunks are generated from the structured markdown files, not directly from raw PDF pages.",
            "Low-information cover/content/divider chunks are filtered out.",
            "Chunk pages are preserved from per-page markdown markers when available.",
            "Linked Manual_Index ids are based on exact section/subsection matches from UM_Manual_Index.json.",
        ],
    }

    OUT_REPORT.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8")

    print(f"Wrote: {OUT_JSONL}")
    print(f"Wrote: {OUT_PRETTY}")
    print(f"Wrote: {OUT_REPORT}")
262
+
263
+
264
+ if __name__ == "__main__":
265
+ main()
UM_Handbook/assets/TensorCat.png ADDED
UM_Handbook/outputs/qwen3_um_handbook_optimized_1/merged_model/chat_template.jinja ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0].role == 'system' %}
4
+ {{- messages[0].content + '\n\n' }}
5
+ {%- endif %}
6
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
7
+ {%- for tool in tools %}
8
+ {{- "\n" }}
9
+ {{- tool | tojson }}
10
+ {%- endfor %}
11
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
12
+ {%- else %}
13
+ {%- if messages[0].role == 'system' %}
14
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
15
+ {%- endif %}
16
+ {%- endif %}
17
+ {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
18
+ {%- for message in messages[::-1] %}
19
+ {%- set index = (messages|length - 1) - loop.index0 %}
20
+ {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
21
+ {%- set ns.multi_step_tool = false %}
22
+ {%- set ns.last_query_index = index %}
23
+ {%- endif %}
24
+ {%- endfor %}
25
+ {%- for message in messages %}
26
+ {%- if message.content is string %}
27
+ {%- set content = message.content %}
28
+ {%- else %}
29
+ {%- set content = '' %}
30
+ {%- endif %}
31
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
32
+ {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
33
+ {%- elif message.role == "assistant" %}
34
+ {%- set reasoning_content = '' %}
35
+ {%- if message.reasoning_content is string %}
36
+ {%- set reasoning_content = message.reasoning_content %}
37
+ {%- else %}
38
+ {%- if '</think>' in content %}
39
+ {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
40
+ {%- set content = content.split('</think>')[-1].lstrip('\n') %}
41
+ {%- endif %}
42
+ {%- endif %}
43
+ {%- if loop.index0 > ns.last_query_index %}
44
+ {%- if loop.last or (not loop.last and reasoning_content) %}
45
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
46
+ {%- else %}
47
+ {{- '<|im_start|>' + message.role + '\n' + content }}
48
+ {%- endif %}
49
+ {%- else %}
50
+ {{- '<|im_start|>' + message.role + '\n' + content }}
51
+ {%- endif %}
52
+ {%- if message.tool_calls %}
53
+ {%- for tool_call in message.tool_calls %}
54
+ {%- if (loop.first and content) or (not loop.first) %}
55
+ {{- '\n' }}
56
+ {%- endif %}
57
+ {%- if tool_call.function %}
58
+ {%- set tool_call = tool_call.function %}
59
+ {%- endif %}
60
+ {{- '<tool_call>\n{"name": "' }}
61
+ {{- tool_call.name }}
62
+ {{- '", "arguments": ' }}
63
+ {%- if tool_call.arguments is string %}
64
+ {{- tool_call.arguments }}
65
+ {%- else %}
66
+ {{- tool_call.arguments | tojson }}
67
+ {%- endif %}
68
+ {{- '}\n</tool_call>' }}
69
+ {%- endfor %}
70
+ {%- endif %}
71
+ {{- '<|im_end|>\n' }}
72
+ {%- elif message.role == "tool" %}
73
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
74
+ {{- '<|im_start|>user' }}
75
+ {%- endif %}
76
+ {{- '\n<tool_response>\n' }}
77
+ {{- content }}
78
+ {{- '\n</tool_response>' }}
79
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
80
+ {{- '<|im_end|>\n' }}
81
+ {%- endif %}
82
+ {%- endif %}
83
+ {%- endfor %}
84
+ {%- if add_generation_prompt %}
85
+ {{- '<|im_start|>assistant\n' }}
86
+ {%- if enable_thinking is defined and enable_thinking is false %}
87
+ {{- '<think>\n\n</think>\n\n' }}
88
+ {%- endif %}
89
+ {%- endif %}
UM_Handbook/outputs/qwen3_um_handbook_optimized_1/merged_model/config.json ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 151643,
8
+ "dtype": "bfloat16",
9
+ "eos_token_id": 151645,
10
+ "head_dim": 128,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 4096,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 12288,
15
+ "layer_types": [
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention",
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention",
41
+ "full_attention",
42
+ "full_attention",
43
+ "full_attention",
44
+ "full_attention",
45
+ "full_attention",
46
+ "full_attention",
47
+ "full_attention",
48
+ "full_attention",
49
+ "full_attention",
50
+ "full_attention",
51
+ "full_attention"
52
+ ],
53
+ "max_position_embeddings": 40960,
54
+ "max_window_layers": 36,
55
+ "model_type": "qwen3",
56
+ "num_attention_heads": 32,
57
+ "num_hidden_layers": 36,
58
+ "num_key_value_heads": 8,
59
+ "pad_token_id": null,
60
+ "rms_norm_eps": 1e-06,
61
+ "rope_parameters": {
62
+ "rope_theta": 1000000,
63
+ "rope_type": "default"
64
+ },
65
+ "sliding_window": null,
66
+ "tie_word_embeddings": false,
67
+ "transformers_version": "5.3.0",
68
+ "use_cache": true,
69
+ "use_sliding_window": false,
70
+ "vocab_size": 151936
71
+ }
UM_Handbook/outputs/qwen3_um_handbook_optimized_1/merged_model/generation_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "pad_token_id": 151643,
9
+ "temperature": 0.6,
10
+ "top_k": 20,
11
+ "top_p": 0.95,
12
+ "transformers_version": "5.3.0"
13
+ }
UM_Handbook/outputs/qwen3_um_handbook_optimized_1/merged_model/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e946ac23b6a68f7a2abbe7b3c22190673c6d3d159b85305268db51b2729ac68a
3
+ size 11422749
UM_Handbook/outputs/qwen3_um_handbook_optimized_1/merged_model/tokenizer_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": null,
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|im_end|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": [
9
+ "<|im_start|>",
10
+ "<|im_end|>",
11
+ "<|object_ref_start|>",
12
+ "<|object_ref_end|>",
13
+ "<|box_start|>",
14
+ "<|box_end|>",
15
+ "<|quad_start|>",
16
+ "<|quad_end|>",
17
+ "<|vision_start|>",
18
+ "<|vision_end|>",
19
+ "<|vision_pad|>",
20
+ "<|image_pad|>",
21
+ "<|video_pad|>"
22
+ ],
23
+ "is_local": true,
24
+ "model_max_length": 131072,
25
+ "pad_token": "<|endoftext|>",
26
+ "split_special_tokens": false,
27
+ "tokenizer_class": "Qwen2Tokenizer",
28
+ "unk_token": null
29
+ }
UM_Handbook/um_handbook_config.py ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from pathlib import Path
3
+
4
+ PROJECT_DIR = Path(__file__).resolve().parent
5
+
6
+ # Root data directory
7
+ DATA_ROOT = PROJECT_DIR / "Dataset"
8
+
9
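+ # Subdirectories (kept consistent with the existing workflow). NOTE(review): the repository ships "Dataset/SFT_Dataset" and "Dataset/SFT Dataset Core Questions Draft", but no "SFT_Dataset_Draft" - confirm SFT_DIR below.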
10
+ PDF_DIR = DATA_ROOT / "pdf"
11
+ MARKDOWN_DIR = DATA_ROOT / "markdown"
12
+ INDEX_DIR = DATA_ROOT / "Manual_Index"
13
+ CHUNKS_DIR = DATA_ROOT / "Source Chunk Dataset"
14
+ SFT_DIR = DATA_ROOT / "SFT_Dataset_Draft"
15
+ REPORTS_DIR = DATA_ROOT / "reports"
16
+
17
+ # Input PDFs
18
+ GENERAL_PDF = PDF_DIR / "General Handbook.pdf"
19
+ COMPLETE_PDF = PDF_DIR / "Complete Handbook.pdf"
20
+
21
+ DRESS_CODE_MANUAL_TEXT = """
22
+ UM STUDENT DRESS CODE AND APPEARANCE POSTER SUMMARY
23
+
24
+ Compliance message
25
+ - All Universiti Malaya students must adhere to the Universiti Malaya Administrative Directions (Student Dress Code and Appearance) 2024 while on campus.
26
+
27
+ Illustrated attire categories on the poster
28
+ - Official Events: the poster illustrates formal or traditional formal attire for official university occasions.
29
+ - Lectures, Office Matters, Examination and Library: the poster illustrates neat, presentable campus attire for normal academic and administrative settings.
30
+ - Sports and Recreational: the poster illustrates sportswear for sports and recreational activities.
31
+
32
+ Enforcement and action
33
+ - Academic, administrative, library and security staff members are authorised to reprimand students verbally or in writing if they violate the Administrative Directions.
34
+ - A student who does not comply may be prevented from entering or dealing in areas where the provisions apply.
35
+ - Other administrative actions may also be taken from time to time.
36
+
37
+ Important limitation
38
+ - This poster illustrates categories of appropriate attire and enforcement expectations, but it does not provide an exhaustive item-by-item prohibited clothing list.
39
+ """
40
+
41
+ # ----------------------------
42
+ # General handbook blocks
43
+ # NOTE:
44
+ # This PDF contains two handbook-style front sections. The current workflow
45
+ # intentionally uses the later normalized pages (e.g. 9, 10, 11...) for the
46
+ # "General Handbook" layer, because those pages contain the cleaner normalized
47
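+ # general/common content that matches the current index design. NOTE(review): the postgraduate Academic Calendar block below uses page 4, not one of the later pages - confirm this is intentional.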
48
+ # ----------------------------
49
+ GENERAL_BLOCKS = [
50
+ {"source_doc": "General Handbook", "scope_label": "general", "section": "Faculty Objectives", "subsection": "Faculty Objectives", "pages": (9, 9)},
51
+ {"source_doc": "General Handbook", "scope_label": "general", "section": "History of the Faculty", "subsection": "History Overview", "pages": (10, 11)},
52
+ {"source_doc": "General Handbook", "scope_label": "postgraduate", "section": "Academic Calendar 2025/2026", "subsection": "Master and Doctorate Level Academic Calendar", "pages": (4, 4)},
53
+ {"source_doc": "General Handbook", "scope_label": "undergraduate", "section": "Academic Calendar 2025/2026", "subsection": "Bachelor Degree Level Academic Calendar", "pages": (12, 12)},
54
+ {"source_doc": "General Handbook", "scope_label": "general", "section": "Teaching and Learning Facilities", "subsection": "Teaching Labs", "pages": (13, 14)},
55
+ {"source_doc": "General Handbook", "scope_label": "general", "section": "Teaching and Learning Facilities", "subsection": "Research Labs", "pages": (14, 16)},
56
+ {"source_doc": "General Handbook", "scope_label": "general", "section": "Teaching and Learning Facilities", "subsection": "Project Based Labs", "pages": (16, 16)},
57
+ {"source_doc": "General Handbook", "scope_label": "general", "section": "Other Facilities", "subsection": "Student Support and Campus Facilities", "pages": (17, 17)},
58
+ ]
59
+
60
+ # ----------------------------
61
+ # Postgraduate programme blocks
62
+ # Pages here are PDF physical pages, not handbook-printed page numbers.
63
+ # These were aligned against the uploaded merged Complete Handbook PDF.
64
+ # ----------------------------
65
+ PG_PROGRAMMES = [
66
+ {
67
+ "code": "PG-AC",
68
+ "name": "Master of Computer Science (Applied Computing)",
69
+ "scope_label": "postgraduate",
70
+ "blocks": [
71
+ ("Programme Requirements", (37, 38)),
72
+ ("Programme Objectives and Outcomes", (39, 40)),
73
+ ("Candidature Requirements", (41, 41)),
74
+ ("Graduate on Time (GOT) Schedule", (42, 42)),
75
+ ("Course Plan", (43, 44)),
76
+ ("List of Courses and Contents", (45, 50)),
77
+ ],
78
+ },
79
+ {
80
+ "code": "PG-SE",
81
+ "name": "Master of Software Engineering (Software Technology)",
82
+ "scope_label": "postgraduate",
83
+ "blocks": [
84
+ ("Programme Requirements", (52, 53)),
85
+ ("Programme Objectives and Outcomes", (54, 55)),
86
+ ("Candidature Requirements", (56, 56)),
87
+ ("Graduate on Time (GOT) Schedule", (57, 57)),
88
+ ("Course Plan", (59, 61)),
89
+ ("List of Courses and Contents", (62, 68)),
90
+ ],
91
+ },
92
+ {
93
+ "code": "PG-DS",
94
+ "name": "Master in Data Science",
95
+ "scope_label": "postgraduate",
96
+ "blocks": [
97
+ ("Programme Requirements", (70, 71)),
98
+ ("Programme Objectives and Outcomes", (72, 74)),
99
+ ("Course Plan", (75, 76)),
100
+ ("List of Courses and Contents", (77, 82)),
101
+ ],
102
+ },
103
+ {
104
+ "code": "PG-CSY",
105
+ "name": "Master of Cyber Security",
106
+ "scope_label": "postgraduate",
107
+ "blocks": [
108
+ ("Programme Requirements", (84, 86)),
109
+ ("Programme Objectives and Outcomes", (87, 88)),
110
+ ("Course Plan", (89, 90)),
111
+ ("List of Courses and Contents", (91, 97)),
112
+ ],
113
+ },
114
+ {
115
+ "code": "PG-AI",
116
+ "name": "Master of Artificial Intelligence",
117
+ "scope_label": "postgraduate",
118
+ "blocks": [
119
+ ("Programme Requirements", (99, 100)),
120
+ ("Programme Objectives and Outcomes", (101, 102)),
121
+ ("Course Plan", (103, 103)),
122
+ ("List of Courses and Contents", (104, 111)),
123
+ ],
124
+ },
125
+ {
126
+ "code": "PG-MR",
127
+ "name": "Master of Computer Science (By Research)",
128
+ "scope_label": "postgraduate",
129
+ "blocks": [
130
+ ("Programme Requirements", (113, 113)),
131
+ ("Learning Objectives and Outcomes", (114, 115)),
132
+ ("Candidature Requirements", (116, 116)),
133
+ ("Graduate on Time (GOT) Schedule", (117, 117)),
134
+ ("Research Methodology / Course Contents", (118, 118)),
135
+ ],
136
+ },
137
+ {
138
+ "code": "PG-PHD",
139
+ "name": "Doctor of Philosophy",
140
+ "scope_label": "postgraduate",
141
+ "blocks": [
142
+ ("Advanced Research Methods Course Content", (120, 120)),
143
+ ("Programme Education Objectives", (121, 121)),
144
+ ("Learning Outcomes", (122, 122)),
145
+ ("Candidature Requirements", (123, 123)),
146
+ ("Proposed Graduate on Time (GOT) Schedule", (124, 124)),
147
+ ],
148
+ },
149
+ ]
150
+
151
+ UG_PROGRAMMES = [
152
+ ("UG-CSN", "Bachelor of Computer Science (Computer System and Network)", (202, 204)),
153
+ ("UG-AI", "Bachelor of Computer Science (Artificial Intelligence)", (206, 208)),
154
+ ("UG-IS", "Bachelor of Computer Science (Information Systems)", (210, 212)),
155
+ ("UG-SE", "Bachelor of Computer Science (Software Engineering)", (214, 216)),
156
+ ("UG-MM", "Bachelor of Computer Science (Multimedia Computing)", (218, 220)),
157
+ ("UG-DS", "Bachelor of Computer Science (Data Science)", (222, 224)),
158
+ ]
159
+
160
+ # ----------------------------
161
+ # Complete handbook blocks
162
+ #
163
+ # IMPORTANT VERIFIED FIX:
164
+ # In the uploaded merged Complete Handbook PDF:
165
+ # - PDF page 186 contains the postgraduate-style Vision/Mission page:
166
+ # Vision: "A globally-influential faculty, enriching lives & shaping the future through computing technology"
167
+ # Mission: "To enrich lives and shape the future for the nation and humanity through education, research and technopreneurship"
168
+ # - PDF page 187 contains the undergraduate-style Vision/Mission page:
169
+ # Vision: "A global faculty impacting the world"
170
+ # Mission: "Propelling computing technology and producing world class leaders"
171
+ #
172
+ # The previous (broken) mapping pointed both the PG and UG identity blocks at the same page,
173
+ # which caused the same Vision/Mission answer to be returned for both questions.
174
+ # ----------------------------
175
+ COMPLETE_BLOCKS = [
176
+ {"source_doc": "Complete Handbook", "scope_label": "postgraduate", "section": "Postgraduate Faculty Identity", "subsection": "Vision and Mission", "pages": (186, 186)},
177
+ {"source_doc": "Complete Handbook", "scope_label": "general", "section": "Faculty Staff", "subsection": "Dean's Office and Management", "pages": (6, 8)},
178
+ {"source_doc": "Complete Handbook", "scope_label": "general", "section": "Faculty Staff", "subsection": "Department of Artificial Intelligence", "pages": (9, 12)},
179
+ {"source_doc": "Complete Handbook", "scope_label": "general", "section": "Faculty Staff", "subsection": "Department of Software Engineering", "pages": (13, 16)},
180
+ {"source_doc": "Complete Handbook", "scope_label": "general", "section": "Faculty Staff", "subsection": "Department of Information Systems", "pages": (17, 20)},
181
+ {"source_doc": "Complete Handbook", "scope_label": "postgraduate", "section": "Postgraduate General Information", "subsection": "Legislation and Prescribed Rules", "pages": (126, 126)},
182
+ {"source_doc": "Complete Handbook", "scope_label": "postgraduate", "section": "Postgraduate General Information", "subsection": "Marking Scheme and Grade Point Average (GPA)", "pages": (127, 127)},
183
+ {"source_doc": "Complete Handbook", "scope_label": "postgraduate", "section": "Research Guidance", "subsection": "Progress Report", "pages": (129, 129)},
184
+ {"source_doc": "Complete Handbook", "scope_label": "postgraduate", "section": "Research Guidance", "subsection": "Supervision Policy for Postgraduate Programmes", "pages": (130, 137)},
185
+ {"source_doc": "Complete Handbook", "scope_label": "postgraduate", "section": "Research Guidance", "subsection": "Thesis Preparation Guidelines", "pages": (138, 171)},
186
+ {"source_doc": "Complete Handbook", "scope_label": "postgraduate", "section": "Research Guidance", "subsection": "Thesis or Dissertation Submission and Examinations", "pages": (172, 172)},
187
+ {"source_doc": "Complete Handbook", "scope_label": "postgraduate", "section": "Research Guidance", "subsection": "Publication Requirement", "pages": (173, 175)},
188
+ {"source_doc": "Complete Handbook", "scope_label": "postgraduate", "section": "Research Guidance", "subsection": "Plagiarism", "pages": (176, 176)},
189
+ {"source_doc": "Complete Handbook", "scope_label": "postgraduate", "section": "Research Guidance", "subsection": "Intellectual Property", "pages": (177, 177)},
190
+ {"source_doc": "Complete Handbook", "scope_label": "postgraduate", "section": "Research Guidance", "subsection": "Postgraduate Activities", "pages": (178, 181)},
191
+ {"source_doc": "Complete Handbook", "scope_label": "general", "section": "Laboratory Regulations and Support", "subsection": "Laboratory Regulations", "pages": (183, 183)},
192
+ {"source_doc": "Complete Handbook", "scope_label": "general", "section": "Laboratory Regulations and Support", "subsection": "Technical Problem Enquiries", "pages": (184, 184)},
193
+ {"source_doc": "Complete Handbook", "scope_label": "undergraduate", "section": "Undergraduate Faculty Identity", "subsection": "Vision and Mission", "pages": (187, 187)},
194
+ {"source_doc": "Complete Handbook", "scope_label": "general", "section": "Faculty Staff", "subsection": "Undergraduate Dean's Office and Department Leadership", "pages": (192, 199)},
195
+ {"source_doc": "Complete Handbook", "scope_label": "undergraduate", "section": "Undergraduate Programmes", "subsection": "Programmes Offered", "pages": (200, 200)},
196
+ {"source_doc": "Complete Handbook", "scope_label": "undergraduate", "section": "Shared Undergraduate Curriculum", "subsection": "University Courses", "pages": (225, 227)},
197
+ {"source_doc": "Complete Handbook", "scope_label": "undergraduate", "section": "Shared Undergraduate Curriculum", "subsection": "Faculty Core Courses", "pages": (228, 230)},
198
+ {"source_doc": "Complete Handbook", "scope_label": "undergraduate", "section": "Shared Undergraduate Curriculum", "subsection": "Programme Core Courses", "pages": (231, 239)},
199
+ {"source_doc": "Complete Handbook", "scope_label": "undergraduate", "section": "Shared Undergraduate Curriculum", "subsection": "Specialization Elective Courses - Computer System and Network", "pages": (240, 244)},
200
+ {"source_doc": "Complete Handbook", "scope_label": "undergraduate", "section": "Shared Undergraduate Curriculum", "subsection": "Specialization Elective Courses - Artificial Intelligence", "pages": (245, 249)},
201
+ {"source_doc": "Complete Handbook", "scope_label": "undergraduate", "section": "Shared Undergraduate Curriculum", "subsection": "Specialization Elective Courses - Information Systems", "pages": (250, 254)},
202
+ {"source_doc": "Complete Handbook", "scope_label": "undergraduate", "section": "Shared Undergraduate Curriculum", "subsection": "Specialization Elective Courses - Software Engineering", "pages": (255, 259)},
203
+ {"source_doc": "Complete Handbook", "scope_label": "undergraduate", "section": "Shared Undergraduate Curriculum", "subsection": "Specialization Elective Courses - Multimedia Computing", "pages": (260, 264)},
204
+ {"source_doc": "Complete Handbook", "scope_label": "undergraduate", "section": "Shared Undergraduate Curriculum", "subsection": "Specialization Elective Courses - Data Science", "pages": (265, 268)},
205
+ {"source_doc": "Complete Handbook", "scope_label": "undergraduate", "section": "Industrial Training", "subsection": "Industrial Training Guidelines", "pages": (270, 280)},
206
+ {"source_doc": "Complete Handbook", "scope_label": "undergraduate", "section": "Academic Project", "subsection": "Academic Project I and II Guidelines", "pages": (282, 289)},
207
+ {"source_doc": "Complete Handbook", "scope_label": "undergraduate", "section": "Language Path and English Communication", "subsection": "Language Path Course / English Communication Programme 2025/2026", "pages": (292, 296)},
208
+ {"source_doc": "Complete Handbook", "scope_label": "general", "section": "Student Dress Code", "subsection": "Dress Code and Appearance Guides for Universiti Malaya Students", "pages": (297, 298), "manual_text": DRESS_CODE_MANUAL_TEXT},
209
+ {"source_doc": "Complete Handbook", "scope_label": "undergraduate", "section": "Undergraduate Rules and Regulations", "subsection": "Examination Honesty and Discipline / Undergraduate Rules", "pages": (299, 300)},
210
+ {"source_doc": "Complete Handbook", "scope_label": "undergraduate", "section": "Examination Grading Scheme", "subsection": "Official University Grades", "pages": (301, 301)},
211
+ ]
212
+
213
+ for code, name, pages in UG_PROGRAMMES:
214
+ COMPLETE_BLOCKS.append({
215
+ "source_doc": "Complete Handbook",
216
+ "scope_label": "undergraduate",
217
+ "section": "Undergraduate Programme Goals and Learning Outcomes",
218
+ "subsection": name,
219
+ "pages": pages,
220
+ })
221
+
222
+ for programme in PG_PROGRAMMES:
223
+ for subsection, pages in programme["blocks"]:
224
+ COMPLETE_BLOCKS.append({
225
+ "source_doc": "Complete Handbook",
226
+ "scope_label": "postgraduate",
227
+ "section": programme["name"],
228
+ "subsection": subsection,
229
+ "pages": pages,
230
+ })