Paijo committed on
update tests/unit/test_extractor.py
Browse files- tests/unit/test_extractor.py +84 -0
tests/unit/test_extractor.py
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import base64
|
| 2 |
+
import pytest
|
| 3 |
+
from app.hunter.extractor import UniversalExtractor
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class TestUniversalExtractor:
    """Unit tests for ``UniversalExtractor.extract_proxies``.

    Each test feeds a string to the extractor and checks the proxy
    objects it returns (their count, ``ip``, ``port`` or ``protocol``).
    """

    def test_extract_simple_ip_port(self):
        """Two ``ip:port`` pairs embedded in a sentence are returned in order."""
        text = "Here is a proxy 1.1.1.1:80 and another 192.168.1.1:8080 end"

        found = UniversalExtractor.extract_proxies(text)

        assert len(found) == 2
        assert (found[0].ip, found[0].port) == ("1.1.1.1", 80)
        assert (found[1].ip, found[1].port) == ("192.168.1.1", 8080)

    def test_extract_base64_content(self):
        """Input that is entirely base64 yields the proxy it encodes."""
        # Base64 encoding of "1.1.1.1:80".
        encoded = "MS4xLjEuMTo4MA=="

        found = UniversalExtractor.extract_proxies(encoded)

        assert len(found) == 1
        only = found[0]
        assert only.ip == "1.1.1.1"
        assert only.port == 80

    def test_extract_messy_html(self):
        """Proxies wrapped in HTML tags are still picked up."""
        markup = """
        <html>
            <body>
                <p>List of proxies:</p>
                <div>10.0.0.1:3128</div>
                <span>8.8.8.8:80</span>
            </body>
        </html>
        """

        found = UniversalExtractor.extract_proxies(markup)

        assert len(found) == 2
        # Order is not checked here, only membership.
        seen_ips = {proxy.ip for proxy in found}
        assert {"10.0.0.1", "8.8.8.8"}.issubset(seen_ips)

    def test_extract_vmess_and_vless(self):
        """A vmess link and a vless link on separate lines; vless must be found."""
        # Fake links shaped to match the ProxyPatterns regexes, which the
        # original author recorded as:
        #   VMess: vmess://[A-Za-z0-9+/=]+
        #   VLESS: vless://[a-zA-Z0-9-]+@[a-zA-Z0-9.-]+:[0-9]+[^\s]*
        vmess_link = "vmess://ew0KICAidiI6ICIyIiwNCiAgInBzIjogInRlc3QiLA0KICAiYWRkIjogIjEuMi4zLjQiLA0KICAicG9ydCI6ICI0NDMiLA0KICAiaWQiOiAiYWJjZCIsDQogICJhaWQiOiAiMCIsDQogICJuZXQiOiAidGNwIiwNCiAgInR5cGUiOiAibm9uZSIsDQogICJob3N0IjogIiIsDQogICJwYXRoIjogIiIsDQogICJ0bHMiOiAiIg0KfQ=="
        vless_link = "vless://uuid-test@example.com:443?type=tcp"

        # The vmess payload is base64 of a valid JSON config, so a parser
        # that rejects malformed JSON should not trip over it.
        payload = "\n".join([vmess_link, vless_link])

        found = UniversalExtractor.extract_proxies(payload)

        # Only vless is required; vmess parsing is more involved, so its
        # presence is deliberately not asserted.
        assert len(found) >= 1
        seen_protocols = {proxy.protocol for proxy in found}
        assert "vless" in seen_protocols

    def test_deduplication(self):
        """The same ``ip:port`` listed twice is reported once."""
        repeated = "1.1.1.1:80\n1.1.1.1:80"

        found = UniversalExtractor.extract_proxies(repeated)

        assert len(found) == 1

    def test_mixed_base64_and_text(self):
        """Plain text with a label prefix yields its single proxy.

        Despite the name, this only exercises the plain-text path.
        """
        # Original author's notes on why mixed input is not tested:
        # - The extractor tries to decode the whole content as base64; if
        #   that fails, _try_decode returns the original text and
        #   _parse_text runs on it.
        # - So "Header\n" + base64 fails to decode and is parsed as text,
        #   where the regexes find nothing in the still-encoded part.
        # - Finding base64 blobs *inside* text is out of scope for Phase 1,
        #   which assumes content is either all text OR all base64.
        labelled = "Proxy: 1.1.1.1:80"

        found = UniversalExtractor.extract_proxies(labelled)

        assert len(found) == 1
|