Spaces:
Running
Running
| """ | |
| Wiki 清洗模块的单元测试。 | |
| """ | |
| from pathlib import Path | |
| from data.wiki.wiki_cleaner import ( | |
| filter_single_line, | |
| filter_html_tags, | |
| filter_empty_brackets, | |
| filter_lang_tags, | |
| clean, | |
| ) | |
| from env.resolve import resolve_path | |
| class TestFilterSingleLine: | |
| """测试单行过滤器""" | |
| def test_single_line_returns_none(self): | |
| """单行文本应该返回 None""" | |
| assert filter_single_line("这是一个重定向") is None | |
| def test_single_line_with_whitespace_returns_none(self): | |
| """单行但包含空白字符应该返回 None""" | |
| assert filter_single_line(" 这是一个重定向 ") is None | |
| def test_multiple_lines_returns_original(self): | |
| """多行文本应该返回原文本""" | |
| text = "第一行\n第二行\n第三行" | |
| assert filter_single_line(text) == text | |
| def test_multiple_lines_with_empty_lines(self): | |
| """多行包含空行应该返回原文本""" | |
| text = "第一行\n\n第二行\n\n" | |
| result = filter_single_line(text) | |
| assert result == text | |
| def test_empty_string_returns_none(self): | |
| """空字符串应该返回 None""" | |
| assert filter_single_line("") is None | |
| def test_only_whitespace_returns_none(self): | |
| """只有空白字符应该返回 None""" | |
| assert filter_single_line(" \n \n ") is None | |
| class TestFilterEmptyBrackets: | |
| """测试空括号过滤器""" | |
| def test_remove_empty_parentheses_in_text(self): | |
| """移除文本中的空括号 ()""" | |
| text = "这是()一段文本" | |
| result = filter_empty_brackets(text) | |
| assert result == "这是一段文本" | |
| def test_remove_empty_chinese_brackets_in_text(self): | |
| """移除文本中的空中文括号 ()""" | |
| text = "这是()一段文本" | |
| result = filter_empty_brackets(text) | |
| assert result == "这是一段文本" | |
| def test_remove_brackets_with_space_in_text(self): | |
| """移除带空格的空括号""" | |
| text = "这是( )一段( )文本" | |
| result = filter_empty_brackets(text) | |
| assert result == "这是一段文本" | |
| def test_keep_brackets_with_content(self): | |
| """保留有内容的括号""" | |
| text = "这是一个(有内容的)括号" | |
| assert filter_empty_brackets(text) == text | |
| def test_remove_square_brackets_in_text(self): | |
| """移除文本中的空方括号 []""" | |
| text = "这是[]一段[ ]文本" | |
| result = filter_empty_brackets(text) | |
| assert result == "这是一段文本" | |
| def test_remove_chinese_square_brackets_in_text(self): | |
| """移除文本中的空中文方括号 【】""" | |
| text = "这是【】一段文本" | |
| result = filter_empty_brackets(text) | |
| assert result == "这是一段文本" | |
| def test_remove_curly_brackets_in_text(self): | |
| """移除文本中的空花括号 {}""" | |
| text = "这是{}一段{ }文本" | |
| result = filter_empty_brackets(text) | |
| assert result == "这是一段文本" | |
| def test_no_brackets_returns_original(self): | |
| """没有括号应该返回原文本""" | |
| text = "这是一段普通文本\n没有任何括号" | |
| assert filter_empty_brackets(text) == text | |
| def test_empty_string(self): | |
| """空字符串应该返回空字符串""" | |
| assert filter_empty_brackets("") == "" | |
| def test_multiple_empty_brackets(self): | |
| """移除多个空括号""" | |
| text = "()()[]【】" | |
| result = filter_empty_brackets(text) | |
| assert result == "" | |
| def test_mixed_empty_and_content_brackets(self): | |
| """混合空括号和有内容的括号""" | |
| text = "这是()(有内容的)和[]的测试" | |
| result = filter_empty_brackets(text) | |
| assert result == "这是(有内容的)和的测试" | |
| def test_multiple_lines_with_empty_brackets(self): | |
| """多行文本中的空括号 ()""" | |
| text = "这是()一段文本\n这是()一段文本" | |
| result = filter_empty_brackets(text) | |
| assert result == "这是一段文本\n这是一段文本" | |
| class TestFilterHtmlTags: | |
| """测试 HTML 标签过滤器""" | |
| def test_remove_templatestyles_tag(self): | |
| """移除 templatestyles 标签(实体编码格式)""" | |
| text = '<templatestyles src="ShareCSS/infobox.css" />正文内容' | |
| result = filter_html_tags(text) | |
| assert result == "正文内容" | |
| def test_remove_multiple_tags(self): | |
| """移除多个 HTML 标签(实体编码格式)""" | |
| text = "<div><p>段落</p></div>" | |
| result = filter_html_tags(text) | |
| assert result == "段落" | |
| def test_no_tags_returns_original(self): | |
| """没有标签应该返回原文本""" | |
| text = "这是一段普通文本" | |
| assert filter_html_tags(text) == text | |
| def test_empty_string(self): | |
| """空字符串应该返回空字符串""" | |
| assert filter_html_tags("") == "" | |
| def test_only_tags(self): | |
| """只有标签应该返回空字符串""" | |
| text = '<templatestyles src="test.css" />' | |
| assert filter_html_tags(text) == "" | |
| def test_mixed_content(self): | |
| """混合内容应该只移除标签""" | |
| text = "开头<tag>中间</tag>结尾" | |
| result = filter_html_tags(text) | |
| assert result == "开头中间结尾" | |
| def test_multiple_lines_with_html_tags(self): | |
| """多行文本中的 HTML 标签""" | |
| text = "第一行<tag>\n第二行<tag>\n第三行" | |
| result = filter_html_tags(text) | |
| assert result == "第一行\n第二行\n第三行" | |
| class TestFilterLangTags: | |
| """测试语言转换标记过滤器""" | |
| def test_remove_single_lang_tags(self): | |
| """移除单个语言转换标记""" | |
| text = "-{H|zh-hans:重定向;zh-hant:重新导向;}-正文" | |
| result = filter_lang_tags(text) | |
| assert result == "正文" | |
| def test_remove_multiple_lang_tagss(self): | |
| """移除多个语言转换标记""" | |
| text = "-{H|zh-hans:重定向;zh-hant:重新导向;}--{H|zh-cn:字符;zh-tw:字元;}-正文" | |
| result = filter_lang_tags(text) | |
| assert result == "正文" | |
| def test_remove_complex_lang_tags(self): | |
| """移除复杂的语言转换标记""" | |
| text = ( | |
| "-{H|zh-hans:文件; zh-hant:档案;}--{H|zh-hans:快捷方式; zh-hant:捷径;}-正文" | |
| ) | |
| result = filter_lang_tags(text) | |
| assert result == "正文" | |
| def test_no_lang_tags_returns_original(self): | |
| """没有语言转换标记应该返回原文本""" | |
| text = "这是一段普通文本" | |
| assert filter_lang_tags(text) == text | |
| def test_empty_string(self): | |
| """空字符串应该返回空字符串""" | |
| assert filter_lang_tags("") == "" | |
| def test_only_lang_tags(self): | |
| """只有语言转换标记应该返回空字符串""" | |
| text = "-{H|zh-hans:重定向;zh-hant:重新导向;}-" | |
| assert filter_lang_tags(text) == "" | |
| def test_multiple_lines_with_lang_tags(self): | |
| """多行文本中的语言转换标记""" | |
| text = "第一行-{H|zh-hans:测试1;}-\n第二行-{H|zh-hans:测试2;}-\n第三行" | |
| result = filter_lang_tags(text) | |
| assert result == "第一行\n第二行\n第三行" | |
| def test_nested_lang_tags(self): | |
| """移除嵌套的语言转换标记""" | |
| text = "-{T|zh:-{zh|}-;zh-hans:-{zh-hans|}-;zh-hant:-{zh-hant|}-;}-正文" | |
| result = filter_lang_tags(text) | |
| assert result == "正文" | |
| def test_deeply_nested_lang_tags(self): | |
| """移除深度嵌套的语言转换标记""" | |
| text = "-{A|-{B|-{C|内容}-}-}-正文" | |
| result = filter_lang_tags(text) | |
| assert result == "正文" | |
| class TestCleanIntegration: | |
| """测试 clean 函数的集成效果""" | |
| def test_single_line_returns_none(self): | |
| """单行文本应该返回 None""" | |
| assert clean("重定向") is None | |
| def test_empty_after_filtering_returns_none(self): | |
| """过滤后为空应该返回 None""" | |
| text = "()()[]" | |
| assert clean(text) is None | |
| def test_multiple_filters_applied(self): | |
| """多个过滤器应该依次应用""" | |
| text = """第一行 | |
| <templatestyles src="test.css" /> | |
| () | |
| -{H|zh-hans:测试;zh-hant:測試;}- | |
| 第二行""" | |
| result = clean(text) | |
| assert result is not None | |
| assert "<" not in result | |
| assert "()" not in result | |
| assert "-{" not in result | |
| assert "第一行" in result | |
| assert "第二行" in result | |
| def test_real_wiki_example(self): | |
| """真实 wiki 文本示例""" | |
| text = """词条标题 | |
| <templatestyles src="ShareCSS/infobox.css" /> | |
| 这是正文内容。 | |
| () | |
| -{H|zh-hans:重定向;zh-hant:重新导向;}- | |
| 更多内容。""" | |
| result = clean(text) | |
| assert result is not None | |
| assert "<templatestyles" not in result | |
| assert "()" not in result | |
| assert "-{" not in result | |
| assert "这是正文内容" in result | |
| assert "更多内容" in result | |
| def test_normal_text_unchanged(self): | |
| """正常文本应该保持不变""" | |
| text = """第一行 | |
| 第二行 | |
| 第三行""" | |
| result = clean(text) | |
| assert result == text | |
| def test_only_whitespace_returns_none(self): | |
| """只有空白字符应该返回 None""" | |
| assert clean(" \n \n ") is None | |
| def test_multiple_lines_clean(self): | |
| """多行文本的完整清洗""" | |
| text = """词条标题 | |
| <templatestyles src="test.css" /> | |
| 这是()一段()文本 | |
| -{H|zh-hans:测试;zh-hant:測試;}- | |
| 第二行 | |
| <div>标签</div> | |
| ()空括号 | |
| 第三行""" | |
| result = clean(text) | |
| assert result is not None | |
| assert "<" not in result | |
| assert "()" not in result | |
| assert "()" not in result | |
| assert "-{" not in result | |
| assert "这是一段文本" in result | |
| assert "第二行" in result | |
| assert "第三行" in result | |
| def test_clean_demo_text(): | |
| """读取 demo_text.txt 文件并打印清洗后的内容""" | |
| demo_file = resolve_path("test/fixtures/clean/demo_text.txt") | |
| with open(demo_file, "r", encoding="utf-8") as f: | |
| content = f.read() | |
| result = clean(content) | |
| print("\n" + "=" * 50) | |
| print("清洗后的内容:") | |
| print("=" * 50) | |
| print(result) | |
| print("=" * 50) | |
| assert result is not None | |