| import re |
| import shutil |
| import os |
|
|
| cendu_ipa_pattern = re.compile(r'成都<img src=\"([a-z0-9\/\-\.]+)\.gif\"/>') |
| title_pattern = re.compile(r'<d:entry id=\"[a-z0-9_]+\" d:title=\"([\u4e00-\u9fff]+)\">') |
| img_folder = "cendu_ipas/" |
|
|
| |
| if not os.path.exists(img_folder): |
| os.makedirs(img_folder) |
| else: |
| |
| for filename in os.listdir(img_folder): |
| file_path = os.path.join(img_folder, filename) |
| if os.path.isfile(file_path): |
| os.remove(file_path) |
|
|
| with open("現代漢語方言大詞典.xml", "r") as input_file: |
| lines = input_file.readlines() |
| i = 0 |
| while i < len(lines): |
| match = title_pattern.search(lines[i]) |
| if match: |
| word = match.group(1) |
| i += 4 |
| match = cendu_ipa_pattern.search(lines[i]) |
| if match: |
| print("Processing {}".format(word)) |
| img_src = match.group(1) + ".gif" |
| |
| if os.path.isfile(img_src): |
| shutil.copy(img_src, os.path.join(img_folder, "{}.gif".format(word))) |
| print("Copied {}".format(img_src)) |
| else: |
| print("Source image file {} does not exist.".format(img_src)) |
| i += 1 |
|
|