{ "cells": [ { "cell_type": "code", "execution_count": 5, "id": "2a4a3f73", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "✅ Text extracted successfully.\n" ] } ], "source": [ "import requests\n", "from bs4 import BeautifulSoup\n", "\n", "def extract_text_from_url(url):\n", " # Send HTTP request\n", " response = requests.get(url)\n", " response.raise_for_status()\n", "\n", " # Parse HTML\n", " soup = BeautifulSoup(response.text, 'html.parser')\n", "\n", " # Remove scripts and styles\n", " for tag in soup(['script', 'style', 'noscript']):\n", " tag.decompose()\n", "\n", " # Extract visible text\n", " text = soup.get_text(separator='\\n')\n", " clean_text = '\\n'.join(line.strip() for line in text.splitlines() if line.strip())\n", "\n", " return clean_text\n", "\n", "# Example usage\n", "url = 'https://docs.docker.com' # Replace with your URL\n", "page_text = extract_text_from_url(url)\n", "\n", "# Save to file (optional)\n", "with open('extracted_text.txt', 'w', encoding='utf-8') as f:\n", " f.write(page_text)\n", "\n", "print(\"✅ Text extracted successfully.\")\n" ] }, { "cell_type": "code", "execution_count": 1, "id": "1c9e088f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "✅ Text extracted and saved to ise_curric.txt\n" ] } ], "source": [ "from bs4 import BeautifulSoup\n", "\n", "def extract_text_from_html_file(html_path):\n", " # Load the HTML file\n", " with open(html_path, \"r\", encoding=\"utf-8\") as file:\n", " soup = BeautifulSoup(file, \"html.parser\")\n", "\n", " # Remove non-visible elements\n", " for tag in soup([\"script\", \"style\", \"noscript\"]):\n", " tag.decompose()\n", "\n", " # Extract visible text\n", " text = soup.get_text(separator=\"\\n\")\n", " clean_text = \"\\n\".join(line.strip() for line in text.splitlines() if line.strip())\n", "\n", " return clean_text\n", "\n", "# Example usage\n", "html_file_path = \"ISE.html\" # e.g., \"./IBT.html\"\n", "output_file_path = \"ise_curric.txt\"\n", "\n", "text = extract_text_from_html_file(html_file_path)\n", "\n", "# Save to file\n", "with open(output_file_path, \"w\", encoding=\"utf-8\") as f:\n", " f.write(text)\n", "\n", "print(f\"✅ Text extracted and saved to {output_file_path}\")\n" ] }, { "cell_type": "code", "execution_count": 4, "id": "950214e2", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['This content covers courses that students normally should take from 1st Year, 1st Semester and 1st Year, 2nd Semester.\\n\\nCourses included:\\n\\n- Basic Korean (GEE3003) - 3 credits - Basic General Education\\n- English Communication: Advanced (GEB1109) - 3 credits - Basic General Education\\n- Introductory Engineering Mathematics (IGS1130) - 3 credits - Major Related General Education\\n- Phronesis Seminar (GEB1116) - 2 credits - Basic General Education\\n- Software Programming (IGS1131) - 3 credits - Major Related General Education\\n- Understanding of Economics (GEE2005) - 3 credits - Core General Education\\n- Business Context and Cultural Communication (IGS1204) - 3 credits - Major Related General Education\\n- Intermidiate Korean (GEE3004) - 3 credits - Basic General Education\\n- Marketing Software application (IGS1102) - 3 credits - Major Related General Education\\n- Principles of Business Administration (CBA1102) - 3 credits - Major Related General Education\\n- Reading Seminar: Humans, Value, Coexistence (GED1007) - 3 credits - Core General Education\\n\\n', 'This content covers courses that students normally should take from 1st Year, 2nd Semester and 2nd Year, 1st Semester, or 3rd semester.\\n\\nCourses included:\\n\\n- Business Context and Cultural Communication (IGS1204) - 3 credits - Major Related General Education\\n- Intermidiate Korean (GEE3004) - 3 credits - Basic General Education\\n- Marketing Software application (IGS1102) - 3 credits - Major Related General Education\\n- Principles of Business Administration (CBA1102) - 3 credits - Major Related General Education\\n- Reading Seminar: Humans, Value, Coexistence (GED1007) - 3 credits - Core General Education\\n- Academic Korean for Foreigners Ⅰ (GEF1105) - 3 credits - Basic General Education\\n- Business English (IBT2101) - 3 credits - Major elective\\n- Business Statistics (IGS2107) - 3 credits - Major Related General Education\\n- Global Economy (IBT2107) - 3 credits - Major elective\\n- International Event Management (IBT2104) - 3 credits - Major elective\\n- International Human Resources Management (IBT2108) - 3 credits - Major elective\\n- International Trade (IBT2106) - 3 credits - Major elective\\n- Introduction to Accounting (IBT1201) - 3 credits - Major Required\\n- Korean Society and Politics (GEE2003) - 3 credits - Core General Education\\n- Organizational Behavior (IBT3112) - 3 credits - Major RequiredThis period transitions from foundational studies to more specialized major-related general education and initial major electives. Students expand their understanding of business contexts, cultural communication, and various aspects of international business.\\n\\n', 'This content covers courses that students normally should take from 2nd Year, 1st Semester, or 3rd semester and 2nd Year, 2nd Semester, or 4th Semester.\\n\\nCourses included:\\n\\n- Academic Korean for Foreigners Ⅰ (GEF1105) - 3 credits - Basic General Education\\n- Business English (IBT2101) - 3 credits - Major elective\\n- Business Statistics (IGS2107) - 3 credits - Major Related General Education\\n- Global Economy (IBT2107) - 3 credits - Major elective\\n- International Event Management (IBT2104) - 3 credits - Major elective\\n- International Human Resources Management (IBT2108) - 3 credits - Major elective\\n- International Trade (IBT2106) - 3 credits - Major elective\\n- Introduction to Accounting (IBT1201) - 3 credits - Major Required\\n- Korean Society and Politics (GEE2003) - 3 credits - Core General Education\\n- Organizational Behavior (IBT3112) - 3 credits - Major Required\\n- Academic Korean for Foreigners Ⅱ (GEF1106) - 3 credits - Core General Education\\n- Advertising (IBT2203) - 3 credits - Major elective\\n- Global Operation (IBT2202) - 3 credits - Major elective\\n- Global Retail Management (IBT2206) - 3 credits - Major elective\\n- International Business Strategy (IBT2102) - 3 credits - Major Required\\n- International Finance and Banking (IBT2201) - 3 credits - Major electiveDuring these semesters, students delve deeper into their major, focusing on required courses and a wide array of electives. Key areas include business statistics, global economics, international trade, accounting, and strategic aspects of international business.\\n\\n', 'This content covers courses that students normally should take from 2nd Year, 2nd Semester, or 4th Semester and 3rd Year, 1st Semester, or 5th Semester.\\n\\nCourses included:\\n\\n- Academic Korean for Foreigners Ⅱ (GEF1106) - 3 credits - Core General Education\\n- Advertising (IBT2203) - 3 credits - Major elective\\n- Global Operation (IBT2202) - 3 credits - Major elective\\n- Global Retail Management (IBT2206) - 3 credits - Major elective\\n- International Business Strategy (IBT2102) - 3 credits - Major Required\\n- International Finance and Banking (IBT2201) - 3 credits - Major elective\\n- Basic programing & practice (IBT3107) - 3 credits - Major elective\\n- Case Studies in International Business (IBT3105) - 3 credits - Major elective\\n- E-Business Management (IBT3102) - 3 credits - Major elective\\n- Global Consumer Behavior (IBT3110) - 3 credits - Major elective\\n- Global Supply Management (IBT3106) - 3 credits - Major elective\\n- International Service Management (IBT3113) - 3 credits - Major Required\\n- Marketing Research (IBT3108) - 3 credits - Major electiveThis phase marks a significant progression into advanced major-specific topics, blending required courses with specialized electives. Students explore global operations, strategic management, fundamental programming, and diverse aspects of international business case studies and marketing.\\n\\n', 'This content covers courses that students normally should take from 3rd Year, 1st Semester, or 5th Semester and 3rd Year, 2nd Semester, or 6th Semester.\\n\\nCourses included:\\n\\n- Basic programing & practice (IBT3107) - 3 credits - Major elective\\n- Case Studies in International Business (IBT3105) - 3 credits - Major elective\\n- E-Business Management (IBT3102) - 3 credits - Major elective\\n- Global Consumer Behavior (IBT3110) - 3 credits - Major elective\\n- Global Supply Management (IBT3106) - 3 credits - Major elective\\n- International Service Management (IBT3113) - 3 credits - Major Required\\n- Marketing Research (IBT3108) - 3 credits - Major elective\\n- CSR Strategy and Cases (IBT3204) - 3 credits - Major elective\\n- DBMS 활용 (IBT3205) - 3 credits - Major elective\\n- Digital Marketing (IBT3206) - 3 credits - Major elective\\n- International Marketing (IBT3201) - 3 credits - Major Required\\n- Logistics Management (IBT3202) - 3 credits - Major RequiredThese semesters concentrate on core major requirements and a variety of specialized electives in international business. Emphasis is placed on practical applications, digital strategies, and critical management areas like marketing, logistics, and database utilization.\\n\\n', 'This content covers courses that students normally should take from 3rd Year, 2nd Semester, or 6th Semester and 4th Year, 1st Semester, or 7th Semester.\\n\\nCourses included:\\n\\n- CSR Strategy and Cases (IBT3204) - 3 credits - Major elective\\n- DBMS 활용 (IBT3205) - 3 credits - Major elective\\n- Digital Marketing (IBT3206) - 3 credits - Major elective\\n- International Marketing (IBT3201) - 3 credits - Major Required\\n- Logistics Management (IBT3202) - 3 credits - Major Required\\n- Advanced Business Statistics: Business Data Analytics using R Programming (IBT4109) - 3 credits - Major elective\\n- Advanced International Business Communication (IBT4110) - 3 credits - Major elective\\n- Big data analysis (IBT4105) - 3 credits - Major elective\\n- Design Thinking (IBT4111) - 3 credits - Major elective\\n- IT and Patent (IBT4103) - 3 credits - Major elective\\n- International Business & Trade Issues (IBT4101) - 3 credits - Major elective\\n- Marketing Communications Campaign (IBT4106) - 3 credits - Major elective\\n- Technology Innovation & Enterprenueship (IBT4108) - 3 credits - Major electiveThis period shifts towards advanced analytical and strategic skills, including data analytics, communication, and innovation. Students engage with complex topics like digital marketing, international trade issues, big data, and entrepreneurial thinking.\\n\\n', 'This content covers courses that students normally should take from 4th Year, 1st Semester, or 7th Semester and 4th Year, 2nd Semester, or 8th Semester.\\n\\nCourses included:\\n\\n- Advanced Business Statistics: Business Data Analytics using R Programming (IBT4109) - 3 credits - Major elective\\n- Advanced International Business Communication (IBT4110) - 3 credits - Major elective\\n- Big data analysis (IBT4105) - 3 credits - Major elective\\n- Design Thinking (IBT4111) - 3 credits - Major elective\\n- IT and Patent (IBT4103) - 3 credits - Major elective\\n- International Business & Trade Issues (IBT4101) - 3 credits - Major elective\\n- Marketing Communications Campaign (IBT4106) - 3 credits - Major elective\\n- Technology Innovation & Enterprenueship (IBT4108) - 3 credits - Major elective\\n- Artificial Intelligence (IBT4203) - 3 credits - Major elective\\n- Convergence of Business Administration & Engineering Capstone Design (Practice) (IBT4207) - 1 credits - Major elective\\n- Convergence of Business Administration & Engineering Capstone Design (Theory) (IBT4206) - 3 credits - Major elective\\n- Crisis Communication and Management (IBT4209) - 3 credits - Major elective\\n- IT and Business (IBT4201) - 3 credits - Major elective\\n- International Business Research Capstone Project (IBT4205) - 3 credits - Major elective\\n- International Logistics (IBT4202) - 3 credits - Major elective\\n- Technology Management (IBT4208) - 3 credits - Major electiveThe final semesters focus on advanced and integrative topics, culminating in capstone projects and specialized electives. Students explore emerging technologies, strategic problem-solving, and practical applications in international business, preparing for professional roles.\\n\\n']\n" ] } ], "source": [ "semesters_data = [\n", " {\n", " \"name\": \"1st Year, 1st Semester\",\n", " \"courses\": [\n", " \"Basic Korean (GEE3003) - 3 credits - Basic General Education\",\n", " \"English Communication: Advanced (GEB1109) - 3 credits - Basic General Education\",\n", " \"Introductory Engineering Mathematics (IGS1130) - 3 credits - Major Related General Education\",\n", " \"Phronesis Seminar (GEB1116) - 2 credits - Basic General Education\",\n", " \"Software Programming (IGS1131) - 3 credits - Major Related General Education\",\n", " \"Understanding of Economics (GEE2005) - 3 credits - Core General Education\"\n", " ]\n", " },\n", " {\n", " \"name\": \"1st Year, 2nd Semester\",\n", " \"courses\": [\n", " \"Business Context and Cultural Communication (IGS1204) - 3 credits - Major Related General Education\",\n", " \"Intermidiate Korean (GEE3004) - 3 credits - Basic General Education\",\n", " \"Marketing Software application (IGS1102) - 3 credits - Major Related General Education\",\n", " \"Principles of Business Administration (CBA1102) - 3 credits - Major Related General Education\",\n", " \"Reading Seminar: Humans, Value, Coexistence (GED1007) - 3 credits - Core General Education\"\n", " ]\n", " },\n", " {\n", " \"name\": \"2nd Year, 1st Semester, or 3rd semester\",\n", " \"courses\": [\n", " \"Academic Korean for Foreigners Ⅰ (GEF1105) - 3 credits - Basic General Education\",\n", " \"Business English (IBT2101) - 3 credits - Major elective\",\n", " \"Business Statistics (IGS2107) - 3 credits - Major Related General Education\",\n", " \"Global Economy (IBT2107) - 3 credits - Major elective\",\n", " \"International Event Management (IBT2104) - 3 credits - Major elective\",\n", " \"International Human Resources Management (IBT2108) - 3 credits - Major elective\",\n", " \"International Trade (IBT2106) - 3 credits - Major elective\",\n", " \"Introduction to Accounting (IBT1201) - 3 credits - Major Required\",\n", " \"Korean Society and Politics (GEE2003) - 3 credits - Core General Education\",\n", " \"Organizational Behavior (IBT3112) - 3 credits - Major Required\"\n", " ]\n", " },\n", " {\n", " \"name\": \"2nd Year, 2nd Semester, or 4th Semester\",\n", " \"courses\": [\n", " \"Academic Korean for Foreigners Ⅱ (GEF1106) - 3 credits - Core General Education\",\n", " \"Advertising (IBT2203) - 3 credits - Major elective\",\n", " \"Global Operation (IBT2202) - 3 credits - Major elective\",\n", " \"Global Retail Management (IBT2206) - 3 credits - Major elective\",\n", " \"International Business Strategy (IBT2102) - 3 credits - Major Required\",\n", " \"International Finance and Banking (IBT2201) - 3 credits - Major elective\"\n", " ]\n", " },\n", " {\n", " \"name\": \"3rd Year, 1st Semester, or 5th Semester\",\n", " \"courses\": [\n", " \"Basic programing & practice (IBT3107) - 3 credits - Major elective\",\n", " \"Case Studies in International Business (IBT3105) - 3 credits - Major elective\",\n", " \"E-Business Management (IBT3102) - 3 credits - Major elective\",\n", " \"Global Consumer Behavior (IBT3110) - 3 credits - Major elective\",\n", " \"Global Supply Management (IBT3106) - 3 credits - Major elective\",\n", " \"International Service Management (IBT3113) - 3 credits - Major Required\",\n", " \"Marketing Research (IBT3108) - 3 credits - Major elective\"\n", " ]\n", " },\n", " {\n", " \"name\": \"3rd Year, 2nd Semester, or 6th Semester\",\n", " \"courses\": [\n", " \"CSR Strategy and Cases (IBT3204) - 3 credits - Major elective\",\n", " \"DBMS 활용 (IBT3205) - 3 credits - Major elective\",\n", " \"Digital Marketing (IBT3206) - 3 credits - Major elective\",\n", " \"International Marketing (IBT3201) - 3 credits - Major Required\",\n", " \"Logistics Management (IBT3202) - 3 credits - Major Required\"\n", " ]\n", " },\n", " {\n", " \"name\": \"4th Year, 1st Semester, or 7th Semester\",\n", " \"courses\": [\n", " \"Advanced Business Statistics: Business Data Analytics using R Programming (IBT4109) - 3 credits - Major elective\",\n", " \"Advanced International Business Communication (IBT4110) - 3 credits - Major elective\",\n", " \"Big data analysis (IBT4105) - 3 credits - Major elective\",\n", " \"Design Thinking (IBT4111) - 3 credits - Major elective\",\n", " \"IT and Patent (IBT4103) - 3 credits - Major elective\",\n", " \"International Business & Trade Issues (IBT4101) - 3 credits - Major elective\",\n", " \"Marketing Communications Campaign (IBT4106) - 3 credits - Major elective\",\n", " \"Technology Innovation & Enterprenueship (IBT4108) - 3 credits - Major elective\"\n", " ]\n", " },\n", " {\n", " \"name\": \"4th Year, 2nd Semester, or 8th Semester\",\n", " \"courses\": [\n", " \"Artificial Intelligence (IBT4203) - 3 credits - Major elective\",\n", " \"Convergence of Business Administration & Engineering Capstone Design (Practice) (IBT4207) - 1 credits - Major elective\",\n", " \"Convergence of Business Administration & Engineering Capstone Design (Theory) (IBT4206) - 3 credits - Major elective\",\n", " \"Crisis Communication and Management (IBT4209) - 3 credits - Major elective\",\n", " \"IT and Business (IBT4201) - 3 credits - Major elective\",\n", " \"International Business Research Capstone Project (IBT4205) - 3 credits - Major elective\",\n", " \"International Logistics (IBT4202) - 3 credits - Major elective\",\n", " \"Technology Management (IBT4208) - 3 credits - Major elective\"\n", " ]\n", " }\n", "]\n", "\n", "descriptions = [\n", " \"\",\n", " \"This period transitions from foundational studies to more specialized major-related general education and initial major electives. Students expand their understanding of business contexts, cultural communication, and various aspects of international business.\",\n", " \"During these semesters, students delve deeper into their major, focusing on required courses and a wide array of electives. Key areas include business statistics, global economics, international trade, accounting, and strategic aspects of international business.\",\n", " \"This phase marks a significant progression into advanced major-specific topics, blending required courses with specialized electives. Students explore global operations, strategic management, fundamental programming, and diverse aspects of international business case studies and marketing.\",\n", " \"These semesters concentrate on core major requirements and a variety of specialized electives in international business. Emphasis is placed on practical applications, digital strategies, and critical management areas like marketing, logistics, and database utilization.\",\n", " \"This period shifts towards advanced analytical and strategic skills, including data analytics, communication, and innovation. Students engage with complex topics like digital marketing, international trade issues, big data, and entrepreneurial thinking.\",\n", " \"The final semesters focus on advanced and integrative topics, culminating in capstone projects and specialized electives. Students explore emerging technologies, strategic problem-solving, and practical applications in international business, preparing for professional roles.\"\n", "]\n", "\n", "output_for_rag_ibt = []\n", "\n", "for i in range(len(semesters_data) - 1):\n", " sem1 = semesters_data[i]\n", " sem2 = semesters_data[i+1]\n", "\n", " abstract_desc = descriptions[i]\n", " \n", " # Precise semester mention as requested\n", " semesters_mention = f\"This content covers courses that students normally should take from {sem1['name']} and {sem2['name']}.\"\n", "\n", " # Combine courses from both semesters, removing duplicates if any (though unlikely for distinct semesters)\n", " combined_courses = list(sem1[\"courses\"]) + list(sem2[\"courses\"])\n", " # Convert to a set to remove duplicates, then back to list if order isn't critical\n", " # For this dataset, courses are distinct per semester, so simple concatenation is fine.\n", " \n", " combined_courses_str = \"\\n- \" + \"\\n- \".join(combined_courses)\n", "\n", " entry = (\n", " f\"{semesters_mention}\\n\\n\"\n", " f\"Courses included:\\n{combined_courses_str}\"\n", " f\"{abstract_desc}\\n\\n\"\n", " \n", " \n", " )\n", " output_for_rag_ibt.append(entry)\n", "\n", "print(output_for_rag_ibt)" ] }, { "cell_type": "code", "execution_count": 5, "id": "964f7337", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'This content covers courses that students normally should take from 1st Year, 2nd Semester and 2nd Year, 1st Semester, or 3rd semester.\\n\\nCourses included:\\n\\n- Business Context and Cultural Communication (IGS1204) - 3 credits - Major Related General Education\\n- Intermidiate Korean (GEE3004) - 3 credits - Basic General Education\\n- Marketing Software application (IGS1102) - 3 credits - Major Related General Education\\n- Principles of Business Administration (CBA1102) - 3 credits - Major Related General Education\\n- Reading Seminar: Humans, Value, Coexistence (GED1007) - 3 credits - Core General Education\\n- Academic Korean for Foreigners Ⅰ (GEF1105) - 3 credits - Basic General Education\\n- Business English (IBT2101) - 3 credits - Major elective\\n- Business Statistics (IGS2107) - 3 credits - Major Related General Education\\n- Global Economy (IBT2107) - 3 credits - Major elective\\n- International Event Management (IBT2104) - 3 credits - Major elective\\n- International Human Resources Management (IBT2108) - 3 credits - Major elective\\n- International Trade (IBT2106) - 3 credits - Major elective\\n- Introduction to Accounting (IBT1201) - 3 credits - Major Required\\n- Korean Society and Politics (GEE2003) - 3 credits - Core General Education\\n- Organizational Behavior (IBT3112) - 3 credits - Major RequiredThis period transitions from foundational studies to more specialized major-related general education and initial major electives. Students expand their understanding of business contexts, cultural communication, and various aspects of international business.\\n\\n'" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "output_for_rag_ibt[1]" ] }, { "cell_type": "code", "execution_count": 19, "id": "692b2f97", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "7" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(output_for_rag_ibt)" ] }, { "cell_type": "code", "execution_count": null, "id": "3131da62", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 15, "id": "3c63a65d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'1st Year, 1st Semester': ['1st Year, 1st Semester',\n", " 'English Communication: Advanced (GEB1109) - 3 credits - Basic General Education',\n", " 'Phronesis Seminar (GEB1116) - 2 credits - Basic General Education',\n", " 'Basic Korean (GEE3003) - 3 credits - Basic General Education',\n", " 'Introductory Engineering Mathematics (IGS1130) - 3 credits - Major Related General Education',\n", " 'Software Programming (IGS1131) - 3 credits - Major Related General Education'],\n", " '1st Year, 2nd Semester': ['1st Year, 2nd Semester',\n", " 'Intermidiate Korean (GEE3004) - 3 credits - Basic General Education',\n", " 'Reading Seminar: Humans, Value, Coexistence (GED3020d) - 3 credits - Core General Education',\n", " 'General Physics (IGS1230) - 3 credits - Major Related General Education',\n", " 'Computer Programming (IGS1232) - 3 credits - Major Related General Education',\n", " 'Discrete Mathematics (ACE1312) - 3 credits - Major Related General Education',\n", " 'Introduction to ISE (ISE1233) - 3 credits - Major Required',\n", " 'Engineering Concept Drawing (ISE1234) - 3 credits - Major Required'],\n", " '2nd Year, 1st Semester': ['2nd Year, 1st Semester',\n", " 'Academic Korean for Foreigners I (GEF1105) - 3 credits - Basic General Education',\n", " 'Object Oriented Programming (IGS2130) - 3 credits - Major Related General Education',\n", " 'Signal and System Design (IGS2133) - 3 credits - Major Related General Education',\n", " 'Creative ISE Design (ISE2132) - 3 credits - Major Required',\n", " 'Basic Circuits and Electronics (ISE2135) - 3 credits - Major Required',\n", " 'Structural Analysis of Mobility (ISE2134, FMS Track) - 3 credits - Major Required',\n", " 'V2X Structural Analysis (ISE2134) -3 credits - Major Required'],\n", " '2nd Year, 2nd Semester': ['2nd Year, 2nd Semester',\n", " 'Linear Algebra (ACE2105) - 3 credits - Major Related General Education',\n", " 'Data Structures (ISE2235, ISS Track) - 3 credits - Major Required',\n", " 'Computer Networks (ISE2232) - 3 credits - Major Elective',\n", " 'Digital Logic and Design (ISE2231) - 3 credits - Major Elective',\n", " 'Dynamic Mechanics 1 (ISE2233, FMS Track) - 3 credits - Major Elective',\n", " 'Intelligent V2X Design (ISE2234, FMS Track) - 3 credits - Major Elective',\n", " 'IT and Patent (IBT4107) - 3 credits - Major Elective'],\n", " '3rd Year, 1st Semester': ['3rd Year, 1st Semester',\n", " 'Engineering Ethics and Discussion (GED1002) - 3 credits - Core General Education',\n", " 'Probability and Statistics (IGS3130) - 3 credits - Major Related General Education',\n", " 'VIP (Theory) (ISE3140) - 3 credits - Major Required',\n", " 'VIP (Practice) (ISE3141) - 1 credit - Major Required',\n", " 'IoT Application System (ISE3138) - 3 credits - Major Elective',\n", " 'Data Communication (ISE3133) - 3 credits - Major Elective',\n", " 'Navigation Systems (ISE3134, FMS Track) - 3 credits - Major Elective',\n", " 'Motor Control (ISE3135, FMS Track) - 3 credits - Major Elective',\n", " 'Cyber Issues and Law (TBD) - 3 credits - Major Elective'],\n", " '3rd Year, 2nd Semester': ['3rd Year, 2nd Semester',\n", " 'Academic Korean for Foreigners II (GEF1106) - 3 credits - Core General Education',\n", " 'Technology Innovation & Entrepreneurship (IBT4108) - 3 credits - Core General Education',\n", " 'Software Engineering (ISE4133) - 3 credits - Major Required',\n", " 'Smart Mobility Engineering Lab (IGS3231) - 3 credits - Major Related General Education',\n", " 'Mobility System Control Practice (ISE3233, FMS Track) - 3 credits - Major Elective',\n", " 'V2X Communication (ISE3234, FMS Track) - 3 credits - Major Elective',\n", " 'Sensor Engineering (ISE3231) - 3 credits - Major Elective',\n", " 'Business Context and Cultural Communication (IGS1204) - 3 credits - Major Elective'],\n", " '4th Year, 1st Semester': ['4th Year, 1st Semester',\n", " 'Academic Korean for Foreigners II (GEF1106) - 3 credits - Core General Education',\n", " 'Convergence of Business Administration & Engineering Capstone Design (Theory) (ISE4206) - 3 credits - Major Elective',\n", " 'Convergence of Business Administration & Engineering Capstone Design (Practice) (ISE4207) - 1 credit - Major Elective',\n", " 'AI Application System (ISE4132) - 3 credits - Major Elective',\n", " 'Capstone Design in Digital Image Processing (ISE4131) - 3 credits - Major Elective',\n", " 'Digital Signal Processing (ISE4136) - 3 credits - Major Elective',\n", " 'Computer Security (ISE4137) - 3 credits - Major Elective',\n", " 'Smart Mobility Service (ISE4134, FMS Track) - 3 credits - Major Elective'],\n", " '4th Year, 2nd Semester': ['4th Year, 2nd Semester',\n", " 'Embedded System Design (ISE3132) - 3 credits - Major Elective',\n", " 'Undergraduate Research Practice (TBD) - 3 credits - Major Required',\n", " 'Industry Practice (TBD) - 3 credits - Major Required',\n", " 'Start-up Practice (TBD) - 3 credits - Major Required']}" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "parse_semester_data(raw_text)" ] }, { "cell_type": "code", "execution_count": 18, "id": "fc125013", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "--- Entry 1 ---\n", "This content covers courses that students normally should take from 1st Year, 1st Semester and 1st Year, 2nd Semester.\n", "\n", "Courses included:\n", "- 1st Year, 1st Semester\n", "- English Communication: Advanced (GEB1109) - 3 credits - Basic General Education\n", "- Phronesis Seminar (GEB1116) - 2 credits - Basic General Education\n", "- Basic Korean (GEE3003) - 3 credits - Basic General Education\n", "- Introductory Engineering Mathematics (IGS1130) - 3 credits - Major Related General Education\n", "- Software Programming (IGS1131) - 3 credits - Major Related General Education\n", "- 1st Year, 2nd Semester\n", "- Intermidiate Korean (GEE3004) - 3 credits - Basic General Education\n", "- Reading Seminar: Humans, Value, Coexistence (GED3020d) - 3 credits - Core General Education\n", "- General Physics (IGS1230) - 3 credits - Major Related General Education\n", "- Computer Programming (IGS1232) - 3 credits - Major Related General Education\n", "- Discrete Mathematics (ACE1312) - 3 credits - Major Related General Education\n", "- Introduction to ISE (ISE1233) - 3 credits - Major Required\n", "- Engineering Concept Drawing (ISE1234) - 3 credits - Major Required\n", "\n", "This period focuses on foundational skills, including basic communication, mathematics, and introductory programming concepts. Students begin to explore core engineering principles and general education requirements, laying the groundwork for more specialized studies.\n", "\n", "==================================================\n", "\n", "--- Entry 2 ---\n", "This content covers courses that students normally should take from 1st Year, 2nd Semester and 2nd Year, 1st Semester.\n", "\n", "Courses included:\n", "- 1st Year, 2nd Semester\n", "- Intermidiate Korean (GEE3004) - 3 credits - Basic General Education\n", "- Reading Seminar: Humans, Value, Coexistence (GED3020d) - 3 credits - Core General Education\n", "- General Physics (IGS1230) - 3 credits - Major Related General Education\n", "- Computer Programming (IGS1232) - 3 credits - Major Related General Education\n", "- Discrete Mathematics (ACE1312) - 3 credits - Major Related General Education\n", "- Introduction to ISE (ISE1233) - 3 credits - Major Required\n", "- Engineering Concept Drawing (ISE1234) - 3 credits - Major Required\n", "- 2nd Year, 1st Semester\n", "- Academic Korean for Foreigners I (GEF1105) - 3 credits - Basic General Education\n", "- Object Oriented Programming (IGS2130) - 3 credits - Major Related General Education\n", "- Signal and System Design (IGS2133) - 3 credits - Major Related General Education\n", "- Creative ISE Design (ISE2132) - 3 credits - Major Required\n", "- Basic Circuits and Electronics (ISE2135) - 3 credits - Major Required\n", "- Structural Analysis of Mobility (ISE2134, FMS Track) - 3 credits - Major Required\n", "- V2X Structural Analysis (ISE2134) -3 credits - Major Required\n", "\n", "Transitioning from core foundational studies, students delve deeper into programming paradigms like object-oriented programming and fundamental electronics. This period also introduces specific aspects of Information and Systems Engineering (ISE) through required courses and creative design thinking.\n", "\n", "==================================================\n", "\n", "--- Entry 3 ---\n", "This content covers courses that students normally should take from 2nd Year, 1st Semester and 2nd Year, 2nd Semester.\n", "\n", "Courses included:\n", "- 2nd Year, 1st Semester\n", "- Academic Korean for Foreigners I (GEF1105) - 3 credits - Basic General Education\n", "- Object Oriented Programming (IGS2130) - 3 credits - Major Related General Education\n", "- Signal and System Design (IGS2133) - 3 credits - Major Related General Education\n", "- Creative ISE Design (ISE2132) - 3 credits - Major Required\n", "- Basic Circuits and Electronics (ISE2135) - 3 credits - Major Required\n", "- Structural Analysis of Mobility (ISE2134, FMS Track) - 3 credits - Major Required\n", "- V2X Structural Analysis (ISE2134) -3 credits - Major Required\n", "- 2nd Year, 2nd Semester\n", "- Linear Algebra (ACE2105) - 3 credits - Major Related General Education\n", "- Data Structures (ISE2235, ISS Track) - 3 credits - Major Required\n", "- Computer Networks (ISE2232) - 3 credits - Major Elective\n", "- Digital Logic and Design (ISE2231) - 3 credits - Major Elective\n", "- Dynamic Mechanics 1 (ISE2233, FMS Track) - 3 credits - Major Elective\n", "- Intelligent V2X Design (ISE2234, FMS Track) - 3 credits - Major Elective\n", "- IT and Patent (IBT4107) - 3 credits - Major Elective\n", "\n", "In these semesters, students solidify their understanding of essential engineering concepts, moving into areas like linear algebra, data structures, and digital logic. Electives begin to offer initial specialization paths within computer networks or dynamic mechanics, expanding their technical breadth.\n", "\n", "==================================================\n", "\n", "--- Entry 4 ---\n", "This content covers courses that students normally should take from 2nd Year, 2nd Semester and 3rd Year, 1st Semester.\n", "\n", "Courses included:\n", "- 2nd Year, 2nd Semester\n", "- Linear Algebra (ACE2105) - 3 credits - Major Related General Education\n", "- Data Structures (ISE2235, ISS Track) - 3 credits - Major Required\n", "- Computer Networks (ISE2232) - 3 credits - Major Elective\n", "- Digital Logic and Design (ISE2231) - 3 credits - Major Elective\n", "- Dynamic Mechanics 1 (ISE2233, FMS Track) - 3 credits - Major Elective\n", "- Intelligent V2X Design (ISE2234, FMS Track) - 3 credits - Major Elective\n", "- IT and Patent (IBT4107) - 3 credits - Major Elective\n", "- 3rd Year, 1st Semester\n", "- Engineering Ethics and Discussion (GED1002) - 3 credits - Core General Education\n", "- Probability and Statistics (IGS3130) - 3 credits - Major Related General Education\n", "- VIP (Theory) (ISE3140) - 3 credits - Major Required\n", "- VIP (Practice) (ISE3141) - 1 credit - Major Required\n", "- IoT Application System (ISE3138) - 3 credits - Major Elective\n", "- Data Communication (ISE3133) - 3 credits - Major Elective\n", "- Navigation Systems (ISE3134, FMS Track) - 3 credits - Major Elective\n", "- Motor Control (ISE3135, FMS Track) - 3 credits - Major Elective\n", "- Cyber Issues and Law (TBD) - 3 credits - Major Elective\n", "\n", "This phase marks a significant shift towards more advanced and specialized topics, including probability, statistics, and initial project-based learning (VIP). Students also gain exposure to emerging fields like IoT applications, data communication, and ethical considerations in engineering.\n", "\n", "==================================================\n", "\n", "--- Entry 5 ---\n", "This content covers courses that students normally should take from 3rd Year, 1st Semester and 3rd Year, 2nd Semester.\n", "\n", "Courses included:\n", "- 3rd Year, 1st Semester\n", "- Engineering Ethics and Discussion (GED1002) - 3 credits - Core General Education\n", "- Probability and Statistics (IGS3130) - 3 credits - Major Related General Education\n", "- VIP (Theory) (ISE3140) - 3 credits - Major Required\n", "- VIP (Practice) (ISE3141) - 1 credit - Major Required\n", "- IoT Application System (ISE3138) - 3 credits - Major Elective\n", "- Data Communication (ISE3133) - 3 credits - Major Elective\n", "- Navigation Systems (ISE3134, FMS Track) - 3 credits - Major Elective\n", "- Motor Control (ISE3135, FMS Track) - 3 credits - Major Elective\n", "- Cyber Issues and Law (TBD) - 3 credits - Major Elective\n", "- 3rd Year, 2nd Semester\n", "- Academic Korean for Foreigners II (GEF1106) - 3 credits - Core General Education\n", "- Technology Innovation & Entrepreneurship (IBT4108) - 3 credits - Core General Education\n", "- Software Engineering (ISE4133) - 3 credits - Major Required\n", "- Smart Mobility Engineering Lab (IGS3231) - 3 credits - Major Related General Education\n", "- Mobility System Control Practice (ISE3233, FMS Track) - 3 credits - Major Elective\n", "- V2X Communication (ISE3234, FMS Track) - 3 credits - Major Elective\n", "- Sensor Engineering (ISE3231) - 3 credits - Major Elective\n", "- Business Context and Cultural Communication (IGS1204) - 3 credits - Major Elective\n", "\n", "Building on prior knowledge, students engage with software engineering principles and practical lab experiences in smart mobility. Core and general education requirements also progress, along with options for specialized electives in sensor engineering, V2X communication, and business contexts.\n", "\n", "==================================================\n", "\n", "--- Entry 6 ---\n", "This content covers courses that students normally should take from 3rd Year, 2nd Semester and 4th Year, 1st Semester.\n", "\n", "Courses included:\n", "- 3rd Year, 2nd Semester\n", "- Academic Korean for Foreigners II (GEF1106) - 3 credits - Core General Education\n", "- Technology Innovation & Entrepreneurship (IBT4108) - 3 credits - Core General Education\n", "- Software Engineering (ISE4133) - 3 credits - Major Required\n", "- Smart Mobility Engineering Lab (IGS3231) - 3 credits - Major Related General Education\n", "- Mobility System Control Practice (ISE3233, FMS Track) - 3 credits - Major Elective\n", "- V2X Communication (ISE3234, FMS Track) - 3 credits - Major Elective\n", "- Sensor Engineering (ISE3231) - 3 credits - Major Elective\n", "- Business Context and Cultural Communication (IGS1204) - 3 credits - Major Elective\n", "- 4th Year, 1st Semester\n", "- Academic Korean for Foreigners II (GEF1106) - 3 credits - Core General Education\n", "- Convergence of Business Administration & Engineering Capstone Design (Theory) (ISE4206) - 3 credits - Major Elective\n", "- Convergence of Business Administration & Engineering Capstone Design (Practice) (ISE4207) - 1 credit - Major Elective\n", "- AI Application System (ISE4132) - 3 credits - Major Elective\n", "- Capstone Design in Digital Image Processing (ISE4131) - 3 credits - Major Elective\n", "- Digital Signal Processing (ISE4136) - 3 credits - Major Elective\n", "- Computer Security (ISE4137) - 3 credits - Major Elective\n", "- Smart Mobility Service (ISE4134, FMS Track) - 3 credits - Major Elective\n", "\n", "As students approach graduation, the focus shifts to advanced theoretical concepts and culminating capstone projects that integrate engineering with business administration. Specializations become more defined with electives in AI, digital signal processing, and computer security, preparing for complex real-world challenges.\n", "\n", "==================================================\n", "\n", "--- Entry 7 ---\n", "This content covers courses that students normally should take from 4th Year, 1st Semester and 4th Year, 2nd Semester.\n", "\n", "Courses included:\n", "- 4th Year, 1st Semester\n", "- Academic Korean for Foreigners II (GEF1106) - 3 credits - Core General Education\n", "- Convergence of Business Administration & Engineering Capstone Design (Theory) (ISE4206) - 3 credits - Major Elective\n", "- Convergence of Business Administration & Engineering Capstone Design (Practice) (ISE4207) - 1 credit - Major Elective\n", "- AI Application System (ISE4132) - 3 credits - Major Elective\n", "- Capstone Design in Digital Image Processing (ISE4131) - 3 credits - Major Elective\n", "- Digital Signal Processing (ISE4136) - 3 credits - Major Elective\n", "- Computer Security (ISE4137) - 3 credits - Major Elective\n", "- Smart Mobility Service (ISE4134, FMS Track) - 3 credits - Major Elective\n", "- 4th Year, 2nd Semester\n", "- Embedded System Design (ISE3132) - 3 credits - Major Elective\n", "- Undergraduate Research Practice (TBD) - 3 credits - Major Required\n", "- Industry Practice (TBD) - 3 credits - Major Required\n", "- Start-up Practice (TBD) - 3 credits - Major Required\n", "\n", "The final semesters are dedicated to culminating projects and practical experiences, including undergraduate research, industry internships, and start-up practices. Students apply their accumulated knowledge in embedded systems design and prepare comprehensively for their professional careers.\n", "\n", "==================================================\n", "\n" ] } ], "source": [ "import re\n", "\n", "def parse_semester_data(text):\n", " \"\"\"\n", " Parses the raw text containing semester and course information\n", " into a dictionary.\n", " Keys are semester names (e.g., \"1st Year, 1st Semester\") and\n", " values are lists of course strings (e.g., \"English Communication: Advanced (GEB1109) - 3 credits - Basic General Education\").\n", " \"\"\"\n", " semesters = {}\n", " current_semester = None\n", " lines = text.strip().split('\\n')\n", "\n", " for line in lines:\n", " line = line.strip()\n", " if line.startswith('**') and line.endswith(':**'):\n", " # Extract semester name, remove '**' and ':'\n", " current_semester = line.strip('**:')\n", " semesters[current_semester] = [f\"{current_semester:} below: \"]\n", " elif line.startswith('- ') and current_semester:\n", " # Extract course information, remove '- ' prefix\n", " semesters[current_semester].append(line.lstrip('- ').strip())\n", " return semesters\n", "\n", "def generate_rag_entries(semester_data, semester_names_ordered, descriptions):\n", " \"\"\"\n", " Generates a list of strings, each representing two overlapping semesters\n", " formatted specifically for a RAG LLM system. Each string includes:\n", " 1. An explicit description of the semesters covered.\n", " 2. A list of all courses from both semesters combined.\n", " 3. An abstract summary description repeated at the end.\n", "\n", " Args:\n", " semester_data (dict): A dictionary where keys are semester names and\n", " values are lists of course strings.\n", " semester_names_ordered (list): An ordered list of semester names\n", " to ensure correct pairing and overlap.\n", " descriptions (dict): A dictionary mapping (semester1_name, semester2_name)\n", " tuples to their two-sentence abstract descriptions.\n", "\n", " Returns:\n", " list: A list of strings, each formatted as required for RAG embeddings.\n", " \"\"\"\n", " rag_entries = []\n", " num_semesters = len(semester_names_ordered)\n", "\n", " # Iterate through semester pairs for overlapping content\n", " for i in range(num_semesters - 1):\n", " sem1_name = semester_names_ordered[i]\n", " sem2_name = semester_names_ordered[i+1]\n", "\n", " # 1. Explicit semester mention at the beginning\n", " # e.g., \"This content covers courses that students normally should take from 1st Year, 1st Semester and 1st Year, 2nd Semester.\"\n", " first_sentence_explicit_semesters = f\"This content covers courses that students normally should take from {sem1_name} and {sem2_name}.\"\n", " \n", " # 2. Abstract description (will also be repeated at the end)\n", " abstract_description_text = descriptions.get(\n", " (sem1_name, sem2_name),\n", " \"This period combines foundational and advanced coursework, preparing students with a broad skillset and specialized knowledge.\"\n", " )\n", "\n", " # Start building the entry string based on the example format\n", " entry_string = f\"{first_sentence_explicit_semesters}\\n\\n\" # Blank line after explicit semester mention\n", " entry_string += f\"Courses included:\\n\" # \"Courses included:\" header, no extra blank line after this\n", "\n", " # Collect all courses from both semesters into a single list\n", " all_courses_in_pair = []\n", " all_courses_in_pair.extend(semester_data.get(sem1_name, []))\n", " all_courses_in_pair.extend(semester_data.get(sem2_name, []))\n", "\n", " # Add each course as a list item\n", " for course in all_courses_in_pair:\n", " entry_string += f\"- {course}\\n\"\n", " \n", " # 3. Repeat the abstract description at the end\n", " entry_string += f\"\\n{abstract_description_text}\" \n", " \n", " # Add the complete entry string to the list, stripping any trailing whitespace\n", " rag_entries.append(entry_string.strip())\n", "\n", " return rag_entries\n", "\n", "# Raw input text containing all semester course information\n", "raw_text = \"\"\"\n", "**1st Year, 1st Semester:**\n", "- English Communication: Advanced (GEB1109) - 3 credits - Basic General Education\n", "- Phronesis Seminar (GEB1116) - 2 credits - Basic General Education\n", "- Basic Korean (GEE3003) - 3 credits - Basic General Education\n", "- Introductory Engineering Mathematics (IGS1130) - 3 credits - Major Related General Education\n", "- Software Programming (IGS1131) - 3 credits - Major Related General Education\n", "\n", "**1st Year, 2nd Semester:**\n", "- Intermidiate Korean (GEE3004) - 3 credits - Basic General Education\n", "- Reading Seminar: Humans, Value, Coexistence (GED3020d) - 3 credits - Core General Education\n", "- General Physics (IGS1230) - 3 credits - Major Related General Education\n", "- Computer Programming (IGS1232) - 3 credits - Major Related General Education\n", "- Discrete Mathematics (ACE1312) - 3 credits - Major Related General Education\n", "- Introduction to ISE (ISE1233) - 3 credits - Major Required\n", "- Engineering Concept Drawing (ISE1234) - 3 credits - Major Required\n", "\n", "**2nd Year, 1st Semester:**\n", "- Academic Korean for Foreigners I (GEF1105) - 3 credits - Basic General Education\n", "- Object Oriented Programming (IGS2130) - 3 credits - Major Related General Education\n", "- Signal and System Design (IGS2133) - 3 credits - Major Related General Education\n", "- Creative ISE Design (ISE2132) - 3 credits - Major Required\n", "- Basic Circuits and Electronics (ISE2135) - 3 credits - Major Required\n", "- Structural Analysis of Mobility (ISE2134, FMS Track) - 3 credits - Major Required\n", "- V2X Structural Analysis (ISE2134) -3 credits - Major Required\n", "\n", "**2nd Year, 2nd Semester:**\n", "- Linear Algebra (ACE2105) - 3 credits - Major Related General Education\n", "- Data Structures (ISE2235, ISS Track) - 3 credits - Major Required\n", "- Computer Networks (ISE2232) - 3 credits - Major Elective\n", "- Digital Logic and Design (ISE2231) - 3 credits - Major Elective\n", "- Dynamic Mechanics 1 (ISE2233, FMS Track) - 3 credits - Major Elective\n", "- Intelligent V2X Design (ISE2234, FMS Track) - 3 credits - Major Elective\n", "- IT and Patent (IBT4107) - 3 credits - Major Elective\n", "\n", "**3rd Year, 1st Semester:**\n", "- Engineering Ethics and Discussion (GED1002) - 3 credits - Core General Education\n", "- Probability and Statistics (IGS3130) - 3 credits - Major Related General Education\n", "- VIP (Theory) (ISE3140) - 3 credits - Major Required\n", "- VIP (Practice) (ISE3141) - 1 credit - Major Required\n", "- IoT Application System (ISE3138) - 3 credits - Major Elective\n", "- Data Communication (ISE3133) - 3 credits - Major Elective\n", "- Navigation Systems (ISE3134, FMS Track) - 3 credits - Major Elective\n", "- Motor Control (ISE3135, FMS Track) - 3 credits - Major Elective\n", "- Cyber Issues and Law (TBD) - 3 credits - Major Elective\n", "\n", "**3rd Year, 2nd Semester:**\n", "- Academic Korean for Foreigners II (GEF1106) - 3 credits - Core General Education\n", "- Technology Innovation & Entrepreneurship (IBT4108) - 3 credits - Core General Education\n", "- Software Engineering (ISE4133) - 3 credits - Major Required\n", "- Smart Mobility Engineering Lab (IGS3231) - 3 credits - Major Related General Education\n", "- Mobility System Control Practice (ISE3233, FMS Track) - 3 credits - Major Elective\n", "- V2X Communication (ISE3234, FMS Track) - 3 credits - Major Elective\n", "- Sensor Engineering (ISE3231) - 3 credits - Major Elective\n", "- Business Context and Cultural Communication (IGS1204) - 3 credits - Major Elective\n", "\n", "**4th Year, 1st Semester:**\n", "- Academic Korean for Foreigners II (GEF1106) - 3 credits - Core General Education\n", "- Convergence of Business Administration & Engineering Capstone Design (Theory) (ISE4206) - 3 credits - Major Elective\n", "- Convergence of Business Administration & Engineering Capstone Design (Practice) (ISE4207) - 1 credit - Major Elective\n", "- AI Application System (ISE4132) - 3 credits - Major Elective\n", "- Capstone Design in Digital Image Processing (ISE4131) - 3 credits - Major Elective\n", "- Digital Signal Processing (ISE4136) - 3 credits - Major Elective\n", "- Computer Security (ISE4137) - 3 credits - Major Elective\n", "- Smart Mobility Service (ISE4134, FMS Track) - 3 credits - Major Elective\n", "\n", "**4th Year, 2nd Semester:**\n", "\n", "- Embedded System Design (ISE3132) - 3 credits - Major Elective\n", "- Undergraduate Research Practice (TBD) - 3 credits - Major Required\n", "- Industry Practice (TBD) - 3 credits - Major Required\n", "- Start-up Practice (TBD) - 3 credits - Major Required\n", "\"\"\"\n", "\n", "# Parse the raw text data into a structured dictionary\n", "semester_data = parse_semester_data(raw_text)\n", "\n", "# Define the ordered list of semester names. This order dictates the overlap sequence.\n", "semester_names_ordered = [\n", " \"1st Year, 1st Semester\",\n", " \"1st Year, 2nd Semester\",\n", " \"2nd Year, 1st Semester\",\n", " \"2nd Year, 2nd Semester\",\n", " \"3rd Year, 1st Semester\",\n", " \"3rd Year, 2nd Semester\",\n", " \"4th Year, 1st Semester\",\n", " \"4th Year, 2nd Semester\"\n", "]\n", "\n", "# Custom abstract descriptions for each pair of semesters.\n", "# These descriptions summarize the educational focus or typical progression\n", "# for the combined two-semester period.\n", "descriptions = {\n", " (\"1st Year, 1st Semester\", \"1st Year, 2nd Semester\"): \"This period focuses on foundational skills, including basic communication, mathematics, and introductory programming concepts. Students begin to explore core engineering principles and general education requirements, laying the groundwork for more specialized studies.\",\n", " (\"1st Year, 2nd Semester\", \"2nd Year, 1st Semester\"): \"Transitioning from core foundational studies, students delve deeper into programming paradigms like object-oriented programming and fundamental electronics. This period also introduces specific aspects of Information and Systems Engineering (ISE) through required courses and creative design thinking.\",\n", " (\"2nd Year, 1st Semester\", \"2nd Year, 2nd Semester\"): \"In these semesters, students solidify their understanding of essential engineering concepts, moving into areas like linear algebra, data structures, and digital logic. Electives begin to offer initial specialization paths within computer networks or dynamic mechanics, expanding their technical breadth.\",\n", " (\"2nd Year, 2nd Semester\", \"3rd Year, 1st Semester\"): \"This phase marks a significant shift towards more advanced and specialized topics, including probability, statistics, and initial project-based learning (VIP). Students also gain exposure to emerging fields like IoT applications, data communication, and ethical considerations in engineering.\",\n", " (\"3rd Year, 1st Semester\", \"3rd Year, 2nd Semester\"): \"Building on prior knowledge, students engage with software engineering principles and practical lab experiences in smart mobility. Core and general education requirements also progress, along with options for specialized electives in sensor engineering, V2X communication, and business contexts.\",\n", " (\"3rd Year, 2nd Semester\", \"4th Year, 1st Semester\"): \"As students approach graduation, the focus shifts to advanced theoretical concepts and culminating capstone projects that integrate engineering with business administration. Specializations become more defined with electives in AI, digital signal processing, and computer security, preparing for complex real-world challenges.\",\n", " (\"4th Year, 1st Semester\", \"4th Year, 2nd Semester\"): \"The final semesters are dedicated to culminating projects and practical experiences, including undergraduate research, industry internships, and start-up practices. Students apply their accumulated knowledge in embedded systems design and prepare comprehensively for their professional careers.\"\n", "}\n", "\n", "# Generate the final list of strings for RAG LLM embedding\n", "ise_rag_llm_entries = generate_rag_entries(semester_data, semester_names_ordered, descriptions)\n", "\n", "# Return the list of strings\n", "# The following loop is for demonstration purposes to show the output format.\n", "# In a real application, you would directly return `rag_llm_entries`.\n", "if __name__ == \"__main__\":\n", " for i, entry in enumerate(rag_llm_entries):\n", " print(f\"--- Entry {i+1} ---\")\n", " print(entry)\n", " print(\"\\n\" + \"=\"*50 + \"\\n\")\n", "\n", " # To directly return the list as requested:\n", " # return rag_llm_entries " ] }, { "cell_type": "code", "execution_count": 19, "id": "18376b56", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['This content covers courses that students normally should take from 1st Year, 1st Semester and 1st Year, 2nd Semester.\\n\\nCourses included:\\n- 1st Year, 1st Semester below: \\n- English Communication: Advanced (GEB1109) - 3 credits - Basic General Education\\n- Phronesis Seminar (GEB1116) - 2 credits - Basic General Education\\n- Basic Korean (GEE3003) - 3 credits - Basic General Education\\n- Introductory Engineering Mathematics (IGS1130) - 3 credits - Major Related General Education\\n- Software Programming (IGS1131) - 3 credits - Major Related General Education\\n- 1st Year, 2nd Semester below: \\n- Intermidiate Korean (GEE3004) - 3 credits - Basic General Education\\n- Reading Seminar: Humans, Value, Coexistence (GED3020d) - 3 credits - Core General Education\\n- General Physics (IGS1230) - 3 credits - Major Related General Education\\n- Computer Programming (IGS1232) - 3 credits - Major Related General Education\\n- Discrete Mathematics (ACE1312) - 3 credits - Major Related General Education\\n- Introduction to ISE (ISE1233) - 3 credits - Major Required\\n- Engineering Concept Drawing (ISE1234) - 3 credits - Major Required\\n\\nThis period focuses on foundational skills, including basic communication, mathematics, and introductory programming concepts. Students begin to explore core engineering principles and general education requirements, laying the groundwork for more specialized studies.',\n", " 'This content covers courses that students normally should take from 1st Year, 2nd Semester and 2nd Year, 1st Semester.\\n\\nCourses included:\\n- 1st Year, 2nd Semester below: \\n- Intermidiate Korean (GEE3004) - 3 credits - Basic General Education\\n- Reading Seminar: Humans, Value, Coexistence (GED3020d) - 3 credits - Core General Education\\n- General Physics (IGS1230) - 3 credits - Major Related General Education\\n- Computer Programming (IGS1232) - 3 credits - Major Related General Education\\n- Discrete Mathematics (ACE1312) - 3 credits - Major Related General Education\\n- Introduction to ISE (ISE1233) - 3 credits - Major Required\\n- Engineering Concept Drawing (ISE1234) - 3 credits - Major Required\\n- 2nd Year, 1st Semester below: \\n- Academic Korean for Foreigners I (GEF1105) - 3 credits - Basic General Education\\n- Object Oriented Programming (IGS2130) - 3 credits - Major Related General Education\\n- Signal and System Design (IGS2133) - 3 credits - Major Related General Education\\n- Creative ISE Design (ISE2132) - 3 credits - Major Required\\n- Basic Circuits and Electronics (ISE2135) - 3 credits - Major Required\\n- Structural Analysis of Mobility (ISE2134, FMS Track) - 3 credits - Major Required\\n- V2X Structural Analysis (ISE2134) -3 credits - Major Required\\n\\nTransitioning from core foundational studies, students delve deeper into programming paradigms like object-oriented programming and fundamental electronics. This period also introduces specific aspects of Information and Systems Engineering (ISE) through required courses and creative design thinking.',\n", " 'This content covers courses that students normally should take from 2nd Year, 1st Semester and 2nd Year, 2nd Semester.\\n\\nCourses included:\\n- 2nd Year, 1st Semester below: \\n- Academic Korean for Foreigners I (GEF1105) - 3 credits - Basic General Education\\n- Object Oriented Programming (IGS2130) - 3 credits - Major Related General Education\\n- Signal and System Design (IGS2133) - 3 credits - Major Related General Education\\n- Creative ISE Design (ISE2132) - 3 credits - Major Required\\n- Basic Circuits and Electronics (ISE2135) - 3 credits - Major Required\\n- Structural Analysis of Mobility (ISE2134, FMS Track) - 3 credits - Major Required\\n- V2X Structural Analysis (ISE2134) -3 credits - Major Required\\n- 2nd Year, 2nd Semester below: \\n- Linear Algebra (ACE2105) - 3 credits - Major Related General Education\\n- Data Structures (ISE2235, ISS Track) - 3 credits - Major Required\\n- Computer Networks (ISE2232) - 3 credits - Major Elective\\n- Digital Logic and Design (ISE2231) - 3 credits - Major Elective\\n- Dynamic Mechanics 1 (ISE2233, FMS Track) - 3 credits - Major Elective\\n- Intelligent V2X Design (ISE2234, FMS Track) - 3 credits - Major Elective\\n- IT and Patent (IBT4107) - 3 credits - Major Elective\\n\\nIn these semesters, students solidify their understanding of essential engineering concepts, moving into areas like linear algebra, data structures, and digital logic. Electives begin to offer initial specialization paths within computer networks or dynamic mechanics, expanding their technical breadth.',\n", " 'This content covers courses that students normally should take from 2nd Year, 2nd Semester and 3rd Year, 1st Semester.\\n\\nCourses included:\\n- 2nd Year, 2nd Semester below: \\n- Linear Algebra (ACE2105) - 3 credits - Major Related General Education\\n- Data Structures (ISE2235, ISS Track) - 3 credits - Major Required\\n- Computer Networks (ISE2232) - 3 credits - Major Elective\\n- Digital Logic and Design (ISE2231) - 3 credits - Major Elective\\n- Dynamic Mechanics 1 (ISE2233, FMS Track) - 3 credits - Major Elective\\n- Intelligent V2X Design (ISE2234, FMS Track) - 3 credits - Major Elective\\n- IT and Patent (IBT4107) - 3 credits - Major Elective\\n- 3rd Year, 1st Semester below: \\n- Engineering Ethics and Discussion (GED1002) - 3 credits - Core General Education\\n- Probability and Statistics (IGS3130) - 3 credits - Major Related General Education\\n- VIP (Theory) (ISE3140) - 3 credits - Major Required\\n- VIP (Practice) (ISE3141) - 1 credit - Major Required\\n- IoT Application System (ISE3138) - 3 credits - Major Elective\\n- Data Communication (ISE3133) - 3 credits - Major Elective\\n- Navigation Systems (ISE3134, FMS Track) - 3 credits - Major Elective\\n- Motor Control (ISE3135, FMS Track) - 3 credits - Major Elective\\n- Cyber Issues and Law (TBD) - 3 credits - Major Elective\\n\\nThis phase marks a significant shift towards more advanced and specialized topics, including probability, statistics, and initial project-based learning (VIP). Students also gain exposure to emerging fields like IoT applications, data communication, and ethical considerations in engineering.',\n", " 'This content covers courses that students normally should take from 3rd Year, 1st Semester and 3rd Year, 2nd Semester.\\n\\nCourses included:\\n- 3rd Year, 1st Semester below: \\n- Engineering Ethics and Discussion (GED1002) - 3 credits - Core General Education\\n- Probability and Statistics (IGS3130) - 3 credits - Major Related General Education\\n- VIP (Theory) (ISE3140) - 3 credits - Major Required\\n- VIP (Practice) (ISE3141) - 1 credit - Major Required\\n- IoT Application System (ISE3138) - 3 credits - Major Elective\\n- Data Communication (ISE3133) - 3 credits - Major Elective\\n- Navigation Systems (ISE3134, FMS Track) - 3 credits - Major Elective\\n- Motor Control (ISE3135, FMS Track) - 3 credits - Major Elective\\n- Cyber Issues and Law (TBD) - 3 credits - Major Elective\\n- 3rd Year, 2nd Semester below: \\n- Academic Korean for Foreigners II (GEF1106) - 3 credits - Core General Education\\n- Technology Innovation & Entrepreneurship (IBT4108) - 3 credits - Core General Education\\n- Software Engineering (ISE4133) - 3 credits - Major Required\\n- Smart Mobility Engineering Lab (IGS3231) - 3 credits - Major Related General Education\\n- Mobility System Control Practice (ISE3233, FMS Track) - 3 credits - Major Elective\\n- V2X Communication (ISE3234, FMS Track) - 3 credits - Major Elective\\n- Sensor Engineering (ISE3231) - 3 credits - Major Elective\\n- Business Context and Cultural Communication (IGS1204) - 3 credits - Major Elective\\n\\nBuilding on prior knowledge, students engage with software engineering principles and practical lab experiences in smart mobility. Core and general education requirements also progress, along with options for specialized electives in sensor engineering, V2X communication, and business contexts.',\n", " 'This content covers courses that students normally should take from 3rd Year, 2nd Semester and 4th Year, 1st Semester.\\n\\nCourses included:\\n- 3rd Year, 2nd Semester below: \\n- Academic Korean for Foreigners II (GEF1106) - 3 credits - Core General Education\\n- Technology Innovation & Entrepreneurship (IBT4108) - 3 credits - Core General Education\\n- Software Engineering (ISE4133) - 3 credits - Major Required\\n- Smart Mobility Engineering Lab (IGS3231) - 3 credits - Major Related General Education\\n- Mobility System Control Practice (ISE3233, FMS Track) - 3 credits - Major Elective\\n- V2X Communication (ISE3234, FMS Track) - 3 credits - Major Elective\\n- Sensor Engineering (ISE3231) - 3 credits - Major Elective\\n- Business Context and Cultural Communication (IGS1204) - 3 credits - Major Elective\\n- 4th Year, 1st Semester below: \\n- Academic Korean for Foreigners II (GEF1106) - 3 credits - Core General Education\\n- Convergence of Business Administration & Engineering Capstone Design (Theory) (ISE4206) - 3 credits - Major Elective\\n- Convergence of Business Administration & Engineering Capstone Design (Practice) (ISE4207) - 1 credit - Major Elective\\n- AI Application System (ISE4132) - 3 credits - Major Elective\\n- Capstone Design in Digital Image Processing (ISE4131) - 3 credits - Major Elective\\n- Digital Signal Processing (ISE4136) - 3 credits - Major Elective\\n- Computer Security (ISE4137) - 3 credits - Major Elective\\n- Smart Mobility Service (ISE4134, FMS Track) - 3 credits - Major Elective\\n\\nAs students approach graduation, the focus shifts to advanced theoretical concepts and culminating capstone projects that integrate engineering with business administration. Specializations become more defined with electives in AI, digital signal processing, and computer security, preparing for complex real-world challenges.',\n", " 'This content covers courses that students normally should take from 4th Year, 1st Semester and 4th Year, 2nd Semester.\\n\\nCourses included:\\n- 4th Year, 1st Semester below: \\n- Academic Korean for Foreigners II (GEF1106) - 3 credits - Core General Education\\n- Convergence of Business Administration & Engineering Capstone Design (Theory) (ISE4206) - 3 credits - Major Elective\\n- Convergence of Business Administration & Engineering Capstone Design (Practice) (ISE4207) - 1 credit - Major Elective\\n- AI Application System (ISE4132) - 3 credits - Major Elective\\n- Capstone Design in Digital Image Processing (ISE4131) - 3 credits - Major Elective\\n- Digital Signal Processing (ISE4136) - 3 credits - Major Elective\\n- Computer Security (ISE4137) - 3 credits - Major Elective\\n- Smart Mobility Service (ISE4134, FMS Track) - 3 credits - Major Elective\\n- 4th Year, 2nd Semester below: \\n- Embedded System Design (ISE3132) - 3 credits - Major Elective\\n- Undergraduate Research Practice (TBD) - 3 credits - Major Required\\n- Industry Practice (TBD) - 3 credits - Major Required\\n- Start-up Practice (TBD) - 3 credits - Major Required\\n\\nThe final semesters are dedicated to culminating projects and practical experiences, including undergraduate research, industry internships, and start-up practices. Students apply their accumulated knowledge in embedded systems design and prepare comprehensively for their professional careers.']" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ise_rag_llm_entries" ] }, { "cell_type": "code", "execution_count": null, "id": "45869755", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "be6ef240", "metadata": {}, "source": [ "## scrapping" ] }, { "cell_type": "code", "execution_count": 24, "id": "a928f864", "metadata": {}, "outputs": [], "source": [ "def scrape_and_save(url, retry=0):\n", " full_url = urljoin(BASE_URL, url)\n", " if full_url in visited or retry > 3:\n", " return\n", " visited.add(full_url)\n", "\n", " try:\n", " print(f\"Scraping: {full_url}\")\n", " response = requests.get(full_url, headers=HEADERS, timeout=15)\n", " response.raise_for_status()\n", "\n", " soup = BeautifulSoup(response.text, \"html.parser\")\n", " text = soup.get_text(separator=\"\\n\", strip=True)\n", "\n", " # ✨ Get semantic filename from first non-empty visible line\n", " first_line = next((line for line in text.split(\"\\n\") if line.strip()), \"unnamed-page\")\n", " filename = quote(first_line[:60], safe='') # limit length and clean\n", "\n", " # Save page content\n", " with open(os.path.join(OUTPUT_DIR, f\"{filename}.txt\"), \"w\", encoding=\"utf-8\") as f:\n", " f.write(text)\n", "\n", " # Find and follow sublinks\n", " for a_tag in soup.find_all(\"a\", href=True):\n", " href = a_tag[\"href\"]\n", " if is_valid_internal(href):\n", " time.sleep(1)\n", " scrape_and_save(href)\n", "\n", " except requests.exceptions.RequestException as e:\n", " print(f\"❌ Error scraping {full_url}: {e}\")\n", " print(\"🔁 Retrying...\")\n", " time.sleep(3)\n", " scrape_and_save(url, retry=retry+1)\n" ] }, { "cell_type": "code", "execution_count": 21, "id": "a54ff0c8", "metadata": {}, "outputs": [ { "ename": "SyntaxError", "evalue": "invalid syntax (3085372508.py, line 1)", "output_type": "error", "traceback": [ "\u001b[0;36m Cell \u001b[0;32mIn[21], line 1\u001b[0;36m\u001b[0m\n\u001b[0;31m https://sgcs.inha.ac.kr/sgcs/9116/subview.do\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n" ] } ], "source": [ "https://sgcs.inha.ac.kr/sgcs/9116/subview.do" ] }, { "cell_type": "code", "execution_count": 25, "id": "b4065ba1", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "🔍 Scraping: https://sgcs.inha.ac.kr/sgcs/9119/subview.do\n", "✅ Saved: inha_pages/IBT.txt\n", "🔍 Scraping: https://sgcs.inha.ac.kr/sgcs/index.do\n", "✅ Saved: inha_pages/%EC%9D%B8%ED%95%98%EB%8C%80%ED%95%99%EA%B5%90_%EA%B5%AD%EC%A0%9C%ED%95%99%EB%B6%80.txt\n", "🔍 Scraping: https://sgcs.inha.ac.kr/sgcs/9080/subview.do\n", "✅ Saved: inha_pages/About_INHA_SGCS.txt\n", "🔍 Scraping: https://sgcs.inha.ac.kr/sgcs/9082/subview.do\n", "✅ Saved: inha_pages/Deans_Welcome.txt\n", "🔍 Scraping: https://sgcs.inha.ac.kr/sgcs/9091/subview.do\n", "✅ Saved: inha_pages/Contact_How_to_Visit.txt\n", "🔍 Scraping: https://sgcs.inha.ac.kr/sgcs/9092/subview.do\n", "✅ Saved: inha_pages/Brochure.txt\n", "🔍 Scraping: https://sgcs.inha.ac.kr/sgcs/9094/subview.do\n", "✅ Saved: inha_pages/Admission.txt\n", "🔍 Scraping: https://sgcs.inha.ac.kr/sgcs/9112/subview.do\n", "✅ Saved: inha_pages/Scholarship.txt\n", "🔍 Scraping: https://sgcs.inha.ac.kr/sgcs/9096/subview.do\n", "✅ Saved: inha_pages/Academic_Calendar.txt\n", "🔍 Scraping: https://sgcs.inha.ac.kr/sgcs/9116/subview.do\n", "✅ Saved: inha_pages/IBT.txt\n", "🔍 Scraping: https://sgcs.inha.ac.kr/sgcs/9117/subview.do\n", "✅ Saved: inha_pages/ISE.txt\n", "🔍 Scraping: https://sgcs.inha.ac.kr/sgcs/9118/subview.do\n", "✅ Saved: inha_pages/KLC.txt\n", "🔍 Scraping: https://sgcs.inha.ac.kr/sgcs/9113/subview.do\n", "✅ Saved: inha_pages/Double_Minor_Major.txt\n", "🔍 Scraping: https://sgcs.inha.ac.kr/sgcs/9120/subview.do\n", "✅ Saved: inha_pages/ISE.txt\n", "🔍 Scraping: https://sgcs.inha.ac.kr/sgcs/9121/subview.do\n", "✅ Saved: inha_pages/KLC.txt\n", "🔍 Scraping: https://sgcs.inha.ac.kr/sgcs/9125/subview.do\n", "✅ Saved: inha_pages/Professors.txt\n", "🔍 Scraping: https://sgcs.inha.ac.kr/sgcs/9126/subview.do\n", "✅ Saved: inha_pages/Lecturers.txt\n", "🔍 Scraping: https://sgcs.inha.ac.kr/sgcs/9127/subview.do\n", "✅ Saved: inha_pages/Professors.txt\n", "🔍 Scraping: https://sgcs.inha.ac.kr/sgcs/9128/subview.do\n", "✅ Saved: inha_pages/Lecturers.txt\n", "🔍 Scraping: https://sgcs.inha.ac.kr/sgcs/9129/subview.do\n", "✅ Saved: inha_pages/Professors.txt\n", "🔍 Scraping: https://sgcs.inha.ac.kr/sgcs/9130/subview.do\n", "✅ Saved: inha_pages/Lecturers.txt\n", "🔍 Scraping: https://sgcs.inha.ac.kr/sgcs/9102/subview.do\n", "✅ Saved: inha_pages/Special_Programs.txt\n", "🔍 Scraping: https://sgcs.inha.ac.kr/sgcs/9103/subview.do\n", "✅ Saved: inha_pages/Student_Activities.txt\n", "🔍 Scraping: https://sgcs.inha.ac.kr/sgcs/9104/subview.do\n", "✅ Saved: inha_pages/Dormitory.txt\n", "🔍 Scraping: https://sgcs.inha.ac.kr/sgcs/9105/subview.do\n", "✅ Saved: inha_pages/Scholarship.txt\n", "🔍 Scraping: https://sgcs.inha.ac.kr/sgcs/9131/subview.do\n", "✅ Saved: inha_pages/Career_Prospects.txt\n", "🔍 Scraping: https://sgcs.inha.ac.kr/sgcs/9132/subview.do\n", "✅ Saved: inha_pages/Career_Development_Help.txt\n", "🔍 Scraping: https://sgcs.inha.ac.kr/sgcs/9133/subview.do\n", "✅ Saved: inha_pages/Notice.txt\n", "🔍 Scraping: https://sgcs.inha.ac.kr/sgcs/9134/subview.do\n", "✅ Saved: inha_pages/News.txt\n", "🔍 Scraping: https://sgcs.inha.ac.kr/sgcs/9135/subview.do\n", "✅ Saved: inha_pages/Gallery.txt\n", "🔍 Scraping: https://sgcs.inha.ac.kr/sgcs/9136/subview.do\n", "✅ Saved: inha_pages/Promotional_Video.txt\n", "🔍 Scraping: https://sgcs.inha.ac.kr/sgcs/9137/subview.do\n", "✅ Saved: inha_pages/Resources.txt\n", "🔍 Scraping: https://sgcs.inha.ac.kr/sgcs/9107/subview.do\n", "✅ Saved: inha_pages/Login.txt\n", "🔍 Scraping: https://sgcs.inha.ac.kr/sgcs/9108/subview.do\n", "✅ Saved: inha_pages/Sitemap.txt\n", "🔍 Scraping: https://sgcs.inha.ac.kr/sgcs//subview.do\n", "✅ Saved: inha_pages/Alert.txt\n", "🔍 Scraping: https://sgcs.inha.ac.kr/sgcs/9091/subview..do\n", "✅ Saved: inha_pages/Contact_How_to_Visit.txt\n", "🎉 Scraping complete.\n" ] } ], "source": [ "import os\n", "import time\n", "import re\n", "import requests\n", "from bs4 import BeautifulSoup\n", "from urllib.parse import urljoin, urlparse, quote\n", "\n", "# === Configuration ===\n", "BASE_URL = \"https://sgcs.inha.ac.kr\"\n", "START_PATHS = [\n", " \"/sgcs/9119/subview.do\", # About\n", " \"/sgcs/9120/subview.do\", # Admission\n", " \"/sgcs/9121/subview.do\" # Academics\n", "]\n", "OUTPUT_DIR = \"inha_pages\"\n", "visited = set()\n", "\n", "HEADERS = {\n", " \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/115.0.0.0 Safari/537.36\"\n", "}\n", "\n", "os.makedirs(OUTPUT_DIR, exist_ok=True)\n", "\n", "# === Utility Functions ===\n", "\n", "def is_valid_internal(url):\n", " parsed = urlparse(url)\n", " return parsed.netloc in [\"\", \"sgcs.inha.ac.kr\"] and url.startswith(\"/sgcs/\")\n", "\n", "def clean_filename_from_text(text):\n", " # Use first visible line as filename, clean it\n", " line = text.strip().split(\"\\n\")[0]\n", " line = re.sub(r'[^\\w\\s-]', '', line) # remove special characters\n", " line = re.sub(r'\\s+', '_', line.strip()) # spaces to underscores\n", " return quote(line[:60], safe='') or \"unnamed_page\"\n", "\n", "# === Main Scraper Function ===\n", "\n", "def scrape_and_save(url, retry=0):\n", " full_url = urljoin(BASE_URL, url)\n", " if full_url in visited or retry > 3:\n", " return\n", " visited.add(full_url)\n", "\n", " try:\n", " print(f\"🔍 Scraping: {full_url}\")\n", " response = requests.get(full_url, headers=HEADERS, timeout=15)\n", " response.raise_for_status()\n", "\n", " soup = BeautifulSoup(response.text, \"html.parser\")\n", " text = soup.get_text(separator=\"\\n\", strip=True)\n", "\n", " if not text.strip():\n", " print(f\"⚠️ Skipping empty page: {full_url}\")\n", " return\n", "\n", " filename = clean_filename_from_text(text)\n", " filepath = os.path.join(OUTPUT_DIR, f\"{filename}.txt\")\n", " \n", " with open(filepath, \"w\", encoding=\"utf-8\") as f:\n", " f.write(text)\n", " print(f\"✅ Saved: {filepath}\")\n", "\n", " # Follow internal links\n", " for a_tag in soup.find_all(\"a\", href=True):\n", " href = a_tag[\"href\"]\n", " if is_valid_internal(href):\n", " time.sleep(1) # polite delay\n", " scrape_and_save(href)\n", "\n", " except requests.exceptions.RequestException as e:\n", " print(f\"❌ Error scraping {full_url}: {e}\")\n", " print(\"🔁 Retrying...\")\n", " time.sleep(3)\n", " scrape_and_save(url, retry=retry+1)\n", "\n", "# === Run the Scraper ===\n", "\n", "if __name__ == \"__main__\":\n", " for path in START_PATHS:\n", " scrape_and_save(path)\n", "\n", " print(\"🎉 Scraping complete.\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "6d1f1cd3", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": ".venv (3.10.18)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.18" } }, "nbformat": 4, "nbformat_minor": 5 }