# TensorTalk / UM_Handbook / um_handbook_config.py
# TensorCat's picture
# Upload 30 files
# 052d67e verified
from pathlib import Path
# Folder that contains this configuration module (symlinks resolved).
PROJECT_DIR = Path(__file__).resolve().parent

# Root of all dataset folders.
DATA_ROOT = PROJECT_DIR.joinpath("Dataset")

# Subfolders of the data root; names are kept in line with the existing workflow.
PDF_DIR = DATA_ROOT.joinpath("pdf")
MARKDOWN_DIR = DATA_ROOT.joinpath("markdown")
INDEX_DIR = DATA_ROOT.joinpath("Manual_Index")
CHUNKS_DIR = DATA_ROOT.joinpath("Source Chunk Dataset")
SFT_DIR = DATA_ROOT.joinpath("SFT_Dataset_Draft")
REPORTS_DIR = DATA_ROOT.joinpath("reports")

# Input PDF files, both located in the PDF folder.
GENERAL_PDF = PDF_DIR.joinpath("General Handbook.pdf")
COMPLETE_PDF = PDF_DIR.joinpath("Complete Handbook.pdf")
# Hand-written text summary of the student dress-code poster. It is attached
# as the "manual_text" value of the "Student Dress Code" entry in the
# Complete Handbook block list. The text starts and ends with a newline.
DRESS_CODE_MANUAL_TEXT = "\n".join(
    (
        "",  # leading newline
        "UM STUDENT DRESS CODE AND APPEARANCE POSTER SUMMARY",
        "Compliance message",
        "- All Universiti Malaya students must adhere to the Universiti Malaya Administrative Directions (Student Dress Code and Appearance) 2024 while on campus.",
        "Illustrated attire categories on the poster",
        "- Official Events: the poster illustrates formal or traditional formal attire for official university occasions.",
        "- Lectures, Office Matters, Examination and Library: the poster illustrates neat, presentable campus attire for normal academic and administrative settings.",
        "- Sports and Recreational: the poster illustrates sportswear for sports and recreational activities.",
        "Enforcement and action",
        "- Academic, administrative, library and security staff members are authorised to reprimand students verbally or in writing if they violate the Administrative Directions.",
        "- A student who does not comply may be prevented from entering or dealing in areas where the provisions apply.",
        "- Other administrative actions may also be taken from time to time.",
        "Important limitation",
        "- This poster illustrates categories of appropriate attire and enforcement expectations, but it does not provide an exhaustive item-by-item prohibited clothing list.",
        "",  # trailing newline
    )
)
# ----------------------------
# General handbook blocks
#
# The General Handbook PDF has two handbook-style front sections. The current
# workflow deliberately takes the "General Handbook" layer from the later,
# normalized pages (e.g. 9, 10, 11...), because those pages hold the cleaner
# general/common content that matches the current index design.
#
# Each row below is (scope label, section, subsection, (first page, last page)).
# NOTE(review): the postgraduate calendar row points at page 4, which lies
# outside the later page range described above - confirm this is intentional.
# ----------------------------
GENERAL_BLOCKS = [
    {
        "source_doc": "General Handbook",
        "scope_label": scope,
        "section": section,
        "subsection": subsection,
        "pages": page_span,
    }
    for scope, section, subsection, page_span in (
        ("general", "Faculty Objectives", "Faculty Objectives", (9, 9)),
        ("general", "History of the Faculty", "History Overview", (10, 11)),
        ("postgraduate", "Academic Calendar 2025/2026", "Master and Doctorate Level Academic Calendar", (4, 4)),
        ("undergraduate", "Academic Calendar 2025/2026", "Bachelor Degree Level Academic Calendar", (12, 12)),
        ("general", "Teaching and Learning Facilities", "Teaching Labs", (13, 14)),
        ("general", "Teaching and Learning Facilities", "Research Labs", (14, 16)),
        ("general", "Teaching and Learning Facilities", "Project Based Labs", (16, 16)),
        ("general", "Other Facilities", "Student Support and Campus Facilities", (17, 17)),
    )
]
# ----------------------------
# Postgraduate programme blocks
#
# Page numbers are physical PDF pages, not the page numbers printed in the
# handbook. They were aligned against the uploaded merged Complete Handbook PDF.
#
# Each row below is (programme code, programme name, blocks), where blocks is
# a list of (subsection title, (first page, last page)) pairs. Every programme
# gets the "postgraduate" scope label.
# NOTE(review): PG-SE jumps from page 57 (GOT schedule) to page 59 (Course
# Plan), so page 58 is not covered by any block - confirm this is intentional.
# ----------------------------
PG_PROGRAMMES = [
    {"code": code, "name": title, "scope_label": "postgraduate", "blocks": blocks}
    for code, title, blocks in (
        (
            "PG-AC",
            "Master of Computer Science (Applied Computing)",
            [
                ("Programme Requirements", (37, 38)),
                ("Programme Objectives and Outcomes", (39, 40)),
                ("Candidature Requirements", (41, 41)),
                ("Graduate on Time (GOT) Schedule", (42, 42)),
                ("Course Plan", (43, 44)),
                ("List of Courses and Contents", (45, 50)),
            ],
        ),
        (
            "PG-SE",
            "Master of Software Engineering (Software Technology)",
            [
                ("Programme Requirements", (52, 53)),
                ("Programme Objectives and Outcomes", (54, 55)),
                ("Candidature Requirements", (56, 56)),
                ("Graduate on Time (GOT) Schedule", (57, 57)),
                ("Course Plan", (59, 61)),
                ("List of Courses and Contents", (62, 68)),
            ],
        ),
        (
            "PG-DS",
            "Master in Data Science",
            [
                ("Programme Requirements", (70, 71)),
                ("Programme Objectives and Outcomes", (72, 74)),
                ("Course Plan", (75, 76)),
                ("List of Courses and Contents", (77, 82)),
            ],
        ),
        (
            "PG-CSY",
            "Master of Cyber Security",
            [
                ("Programme Requirements", (84, 86)),
                ("Programme Objectives and Outcomes", (87, 88)),
                ("Course Plan", (89, 90)),
                ("List of Courses and Contents", (91, 97)),
            ],
        ),
        (
            "PG-AI",
            "Master of Artificial Intelligence",
            [
                ("Programme Requirements", (99, 100)),
                ("Programme Objectives and Outcomes", (101, 102)),
                ("Course Plan", (103, 103)),
                ("List of Courses and Contents", (104, 111)),
            ],
        ),
        (
            "PG-MR",
            "Master of Computer Science (By Research)",
            [
                ("Programme Requirements", (113, 113)),
                ("Learning Objectives and Outcomes", (114, 115)),
                ("Candidature Requirements", (116, 116)),
                ("Graduate on Time (GOT) Schedule", (117, 117)),
                ("Research Methodology / Course Contents", (118, 118)),
            ],
        ),
        (
            "PG-PHD",
            "Doctor of Philosophy",
            [
                ("Advanced Research Methods Course Content", (120, 120)),
                ("Programme Education Objectives", (121, 121)),
                ("Learning Outcomes", (122, 122)),
                ("Candidature Requirements", (123, 123)),
                ("Proposed Graduate on Time (GOT) Schedule", (124, 124)),
            ],
        ),
    )
]
# Undergraduate programmes as (programme code, full programme name,
# (first page, last page)) tuples. All six are specialisations of the same
# degree, so only the specialisation name is spelled out per row.
UG_PROGRAMMES = [
    (code, f"Bachelor of Computer Science ({specialisation})", page_span)
    for code, specialisation, page_span in (
        ("UG-CSN", "Computer System and Network", (202, 204)),
        ("UG-AI", "Artificial Intelligence", (206, 208)),
        ("UG-IS", "Information Systems", (210, 212)),
        ("UG-SE", "Software Engineering", (214, 216)),
        ("UG-MM", "Multimedia Computing", (218, 220)),
        ("UG-DS", "Data Science", (222, 224)),
    )
]
# ----------------------------
# Complete handbook blocks
#
# IMPORTANT VERIFIED FIX:
# In the uploaded merged Complete Handbook PDF:
#   - PDF page 186 is the postgraduate-style Vision/Mission page:
#       Vision: "A globally-influential faculty, enriching lives & shaping the future through computing technology"
#       Mission: "To enrich lives and shape the future for the nation and humanity through education, research and technopreneurship"
#   - PDF page 187 is the undergraduate-style Vision/Mission page:
#       Vision: "A global faculty impacting the world"
#       Mission: "Propelling computing technology and producing world class leaders"
#
# The earlier, broken mapping pointed both the PG and the UG identity at the
# same page, so both questions received the same answer.
#
# Each row below is (scope label, section, subsection, (first page, last page))
# optionally followed by extra (key, value) pairs that are added to that
# row's dictionary after the standard keys.
# ----------------------------
COMPLETE_BLOCKS = [
    {
        "source_doc": "Complete Handbook",
        "scope_label": scope,
        "section": section,
        "subsection": subsection,
        "pages": page_span,
        **dict(extras),
    }
    for scope, section, subsection, page_span, *extras in (
        ("postgraduate", "Postgraduate Faculty Identity", "Vision and Mission", (186, 186)),
        ("general", "Faculty Staff", "Dean's Office and Management", (6, 8)),
        ("general", "Faculty Staff", "Department of Artificial Intelligence", (9, 12)),
        ("general", "Faculty Staff", "Department of Software Engineering", (13, 16)),
        ("general", "Faculty Staff", "Department of Information Systems", (17, 20)),
        ("postgraduate", "Postgraduate General Information", "Legislation and Prescribed Rules", (126, 126)),
        ("postgraduate", "Postgraduate General Information", "Marking Scheme and Grade Point Average (GPA)", (127, 127)),
        ("postgraduate", "Research Guidance", "Progress Report", (129, 129)),
        ("postgraduate", "Research Guidance", "Supervision Policy for Postgraduate Programmes", (130, 137)),
        ("postgraduate", "Research Guidance", "Thesis Preparation Guidelines", (138, 171)),
        ("postgraduate", "Research Guidance", "Thesis or Dissertation Submission and Examinations", (172, 172)),
        ("postgraduate", "Research Guidance", "Publication Requirement", (173, 175)),
        ("postgraduate", "Research Guidance", "Plagiarism", (176, 176)),
        ("postgraduate", "Research Guidance", "Intellectual Property", (177, 177)),
        ("postgraduate", "Research Guidance", "Postgraduate Activities", (178, 181)),
        ("general", "Laboratory Regulations and Support", "Laboratory Regulations", (183, 183)),
        ("general", "Laboratory Regulations and Support", "Technical Problem Enquiries", (184, 184)),
        ("undergraduate", "Undergraduate Faculty Identity", "Vision and Mission", (187, 187)),
        ("general", "Faculty Staff", "Undergraduate Dean's Office and Department Leadership", (192, 199)),
        ("undergraduate", "Undergraduate Programmes", "Programmes Offered", (200, 200)),
        ("undergraduate", "Shared Undergraduate Curriculum", "University Courses", (225, 227)),
        ("undergraduate", "Shared Undergraduate Curriculum", "Faculty Core Courses", (228, 230)),
        ("undergraduate", "Shared Undergraduate Curriculum", "Programme Core Courses", (231, 239)),
        ("undergraduate", "Shared Undergraduate Curriculum", "Specialization Elective Courses - Computer System and Network", (240, 244)),
        ("undergraduate", "Shared Undergraduate Curriculum", "Specialization Elective Courses - Artificial Intelligence", (245, 249)),
        ("undergraduate", "Shared Undergraduate Curriculum", "Specialization Elective Courses - Information Systems", (250, 254)),
        ("undergraduate", "Shared Undergraduate Curriculum", "Specialization Elective Courses - Software Engineering", (255, 259)),
        ("undergraduate", "Shared Undergraduate Curriculum", "Specialization Elective Courses - Multimedia Computing", (260, 264)),
        ("undergraduate", "Shared Undergraduate Curriculum", "Specialization Elective Courses - Data Science", (265, 268)),
        ("undergraduate", "Industrial Training", "Industrial Training Guidelines", (270, 280)),
        ("undergraduate", "Academic Project", "Academic Project I and II Guidelines", (282, 289)),
        ("undergraduate", "Language Path and English Communication", "Language Path Course / English Communication Programme 2025/2026", (292, 296)),
        # The only row with an extra key: it carries the hand-written poster summary.
        ("general", "Student Dress Code", "Dress Code and Appearance Guides for Universiti Malaya Students", (297, 298), ("manual_text", DRESS_CODE_MANUAL_TEXT)),
        ("undergraduate", "Undergraduate Rules and Regulations", "Examination Honesty and Discipline / Undergraduate Rules", (299, 300)),
        ("undergraduate", "Examination Grading Scheme", "Official University Grades", (301, 301)),
    )
]
# Add one Complete Handbook block per undergraduate programme. Only the name
# and page range are used; the programme code is not part of the block.
# Generator expressions keep the loop variables out of the module namespace,
# so they are neither left bound after the loop nor exported by `import *`.
COMPLETE_BLOCKS.extend(
    {
        "source_doc": "Complete Handbook",
        "scope_label": "undergraduate",
        "section": "Undergraduate Programme Goals and Learning Outcomes",
        "subsection": ug_name,
        "pages": ug_pages,
    }
    for _ug_code, ug_name, ug_pages in UG_PROGRAMMES
)

# Add one Complete Handbook block per (postgraduate programme, subsection)
# pair, in programme order and then block order. The programme name becomes
# the section. The scope label is read from the programme entry itself so that
# the "scope_label" field declared there actually takes effect; entries
# without that field fall back to the previously hard-coded "postgraduate".
COMPLETE_BLOCKS.extend(
    {
        "source_doc": "Complete Handbook",
        "scope_label": pg_programme.get("scope_label", "postgraduate"),
        "section": pg_programme["name"],
        "subsection": pg_subsection,
        "pages": pg_pages,
    }
    for pg_programme in PG_PROGRAMMES
    for pg_subsection, pg_pages in pg_programme["blocks"]
)