Spaces:

WissalllK
/

esicodehub-ai

Sleeping

App Files Files Community

esicodehub-ai / phase2 /preprocess.py

WissalllK

Add ESIcodeHub AI detection service

a937307 5 days ago

raw

history blame contribute delete

10.3 kB

	"""
	preprocess.py
	=============
	Strip comments, docstrings, and blank lines from Python source.

	Used to normalize code BEFORE embedding so the cosine-similarity step
	isn't dominated by surface artifacts ("code with comments" vs
	"code without comments") instead of the actual logic.

	Public function:
	strip(code: str) -> str

	Run this file directly to execute the unit tests:
	python preprocess.py
	"""

	import ast
	import io
	import sys
	import tokenize


	# ---------------------------------------------------------------------------
	# CORE
	# ---------------------------------------------------------------------------

	def _remove_comment_tokens(code: str) -> str:
	"""Drop tokens of type COMMENT using Python's own tokenizer.
	Safe against `#` inside strings because tokenize knows the difference."""
	if not code.strip():
	return code

	# Collect (start_pos, end_pos) of every comment token.
	comment_ranges = []
	try:
	tokens = tokenize.generate_tokens(io.StringIO(code).readline)
	for tok in tokens:
	if tok.type == tokenize.COMMENT:
	comment_ranges.append((tok.start, tok.end))
	except (tokenize.TokenError, Exception):
	# Source has lexer-level issues. Return as-is rather than corrupt it.
	return code

	if not comment_ranges:
	return code

	# Rebuild line-by-line, deleting the comment slice from each affected line.
	# tokenize positions are (row, col) with row 1-indexed.
	lines = code.splitlines(keepends=True)
	# Group by line so we delete from the rightmost comment first
	# (deleting left-first would shift columns of subsequent ones).
	by_line: dict[int, list] = {}
	for (sr, sc), (er, ec) in comment_ranges:
	by_line.setdefault(sr, []).append((sc, ec, sr == er))

	for row, ranges in by_line.items():
	if row - 1 >= len(lines):
	continue
	line = lines[row - 1]
	# Process rightmost first.
	for sc, ec, single_line in sorted(ranges, key=lambda x: -x[0]):
	if single_line:
	# Cut from sc to ec; preserve trailing newline if present.
	line = line[:sc].rstrip() + ("\n" if line.endswith("\n") else "")
	else:
	line = line[:sc].rstrip() + ("\n" if line.endswith("\n") else "")
	lines[row - 1] = line

	return "".join(lines)


	def _remove_docstrings(code: str) -> str:
	"""Walk the AST. For Module/FunctionDef/AsyncFunctionDef/ClassDef nodes,
	if the first statement is a bare string-literal expression, that's the
	docstring -- replace it with a `pass` to keep the parent body legal.

	We use AST mutation + ast.unparse rather than line-deletion because
	ast.unparse rebuilds source faithfully and handles every edge case
	(single-line docstrings, raw strings, f-strings used as docstrings, etc.).
	"""
	if not code.strip():
	return code

	try:
	tree = ast.parse(code)
	except SyntaxError:
	# Can't parse -> can't safely modify. Return original.
	return code

	docstring_node_types = (
	ast.Module, ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef,
	)

	for node in ast.walk(tree):
	if not isinstance(node, docstring_node_types):
	continue
	if not node.body:
	continue
	first = node.body[0]
	# A docstring is an Expr node whose value is a Constant str.
	if (isinstance(first, ast.Expr)
	and isinstance(first.value, ast.Constant)
	and isinstance(first.value.value, str)):
	if isinstance(node, ast.Module):
	# Module docstrings: always safe to remove.
	node.body.pop(0)
	elif len(node.body) == 1:
	# Docstring is the ONLY statement — replace with pass
	# so the function/class body stays syntactically legal.
	node.body[0] = ast.Pass()
	else:
	# Docstring followed by real code — just remove it.
	# No pass needed; real code keeps the body legal.
	node.body.pop(0)

	try:
	return ast.unparse(tree)
	except Exception:
	# ast.unparse failed (very rare). Return original.
	return code


	def _remove_blank_lines(code: str) -> str:
	"""Drop lines that are empty or only whitespace."""
	return "\n".join(
	line for line in code.splitlines() if line.strip()
	)


	def strip(code: str) -> str:
	    """Return *code* with docstrings, comments and blank lines removed.

	    The passes run in a fixed order: docstrings first (AST-based, needs
	    source that parses), then comments (token-based), then blank lines
	    (plain string filtering).
	    """
	    passes = (_remove_docstrings, _remove_comment_tokens, _remove_blank_lines)
	    for apply_pass in passes:
	        code = apply_pass(code)
	    return code


	# ---------------------------------------------------------------------------
	# UNIT TESTS
	# ---------------------------------------------------------------------------

	def _check(name: str, src: str, must_contain=None, must_not_contain=None,
	           must_be_empty=False, must_parse=True):
	    """Run strip() on *src*, verify the expectations, and print the outcome.

	    Args:
	        name: Label printed with the result.
	        src: Source text handed to strip().
	        must_contain: Optional substrings that must appear in the output.
	        must_not_contain: Optional substrings that must be absent.
	        must_be_empty: When true, the output must be blank.
	        must_parse: When true, non-blank output must pass ast.parse.

	    Returns:
	        True when every expectation holds, False otherwise (including when
	        strip() itself raises).
	    """
	    try:
	        result = strip(src)
	    except Exception as exc:
	        print(f" [FAIL] {name}: strip() raised {type(exc).__name__}: {exc}")
	        return False

	    has_content = bool(result.strip())
	    problems = []

	    if must_be_empty and has_content:
	        problems.append(f"expected empty, got: {result!r}")

	    problems.extend(
	        f"missing: {needle!r}"
	        for needle in (must_contain or ())
	        if needle not in result
	    )
	    problems.extend(
	        f"should not contain: {needle!r}"
	        for needle in (must_not_contain or ())
	        if needle in result
	    )

	    # Blank output is never parsed; there is nothing to validate.
	    if must_parse and has_content:
	        try:
	            ast.parse(result)
	        except SyntaxError as exc:
	            problems.append(f"output does not parse: {exc}")

	    if not problems:
	        print(f" [ OK ] {name}")
	        return True

	    print(f" [FAIL] {name}")
	    for problem in problems:
	        print(f" {problem}")
	    print(f" output was:\n "
	          + result.replace("\n", "\n "))
	    return False


	def run_tests():
	    """Run the built-in strip() test cases and print a pass/fail report.

	    Each case is ``(name, source, must_contain, must_not_contain)`` with an
	    optional fifth item that sets ``must_be_empty``.

	    Returns:
	        True when every case passed, False otherwise.
	    """
	    rule = "=" * 70
	    print(rule)
	    print("UNIT TESTS")
	    print(rule)

	    cases = [
	        # 1. Whole-line comment
	        ("plain_comment",
	         "# this is a comment\nx = 1\n",
	         ["x = 1"], ["this is a comment"]),

	        # 2. Trailing comment after code
	        ("inline_comment",
	         "x = 1 # inline\ny = 2\n",
	         ["x = 1", "y = 2"], ["inline"]),

	        # 3. '#' inside a string literal must survive
	        #    (quote style may be normalized, so only content is checked)
	        ("hash_in_string",
	         'print("# not a comment")\n',
	         ["# not a comment"], None),

	        # 4. '#' inside a URL string
	        ("hash_in_url",
	         'url = "https://example.com#anchor"\nprint(url)\n',
	         ["#anchor"], None),

	        # 5. Module docstring
	        ("module_docstring",
	         '"""this is a module docstring"""\nx = 1\n',
	         ["x = 1"], ["module docstring"]),

	        # 6. Function docstring
	        ("function_docstring",
	         'def f():\n """fn docstring"""\n return 1\n',
	         ["def f", "return 1"], ["fn docstring"]),

	        # 7. Class docstring
	        ("class_docstring",
	         'class C:\n """class docstring"""\n x = 1\n',
	         ["class C", "x = 1"], ["class docstring"]),

	        # 8. Triple-quoted string used as a value must be kept
	        ("triple_quoted_value",
	         'x = """real value"""\nprint(x)\n',
	         ["real value"], None),

	        # 9. Blank lines between statements
	        ("blank_lines",
	         "x = 1\n\n\ny = 2\n",
	         ["x = 1", "y = 2"], None),

	        # 10. Trailing comment inside an indented block
	        ("indented_inline",
	         "if True:\n x = 1 # inner comment\n",
	         ["x = 1"], ["inner comment"]),

	        # 11. Comments, docstrings and blank lines together
	        ("mixed",
	         '"""module doc"""\n\n# top comment\ndef f():\n """fn doc"""\n x = 1 # inline\n return x\n\n',
	         ["def f", "x = 1", "return x"],
	         ["module doc", "fn doc", "top comment", "inline"]),

	        # 12. f-string with '#' in its format spec
	        ("fstring_format",
	         'x = 255\nprint(f"{x:#x}")\n',
	         ["#x"], None),

	        # 13. File containing only comments -> output must be empty
	        ("comment_only",
	         "# only a comment\n# another\n",
	         None, None, True),

	        # 14. Empty file
	        ("empty",
	         "",
	         None, None, True),

	        # 15. Whitespace-only file
	        ("whitespace_only",
	         " \n \n\n",
	         None, None, True),
	    ]

	    outcomes = []
	    for name, src, must, must_not, *extra in cases:
	        # A missing fifth item means the default (output need not be empty).
	        must_empty = extra[0] if extra else False
	        outcomes.append(
	            _check(name, src, must_contain=must,
	                   must_not_contain=must_not, must_be_empty=must_empty)
	        )

	    passed = sum(int(ok) for ok in outcomes)
	    total = len(outcomes)

	    print()
	    print(f"{passed}/{total} passed")
	    return passed == total


	def run_apps_demo():
	    """Print before/after line counts for the samples in stress_samples.json.

	    The file is looked up in the current working directory; when it does not
	    exist a notice is printed and nothing else happens. Each sample is read
	    as a mapping with "code", "category" and "problem_id" entries. For each
	    one the report shows the line count before and after strip(), and
	    whether the stripped code still parses.
	    """
	    import json
	    from pathlib import Path

	    samples_path = Path("stress_samples.json")
	    if not samples_path.exists():
	        print("\n(stress_samples.json not found, skipping APPS demo)")
	        return

	    rule = "=" * 70
	    print()
	    print(rule)
	    print("DEMO ON CACHED APPS SAMPLES (before/after line counts)")
	    print(rule)

	    for sample in json.loads(samples_path.read_text(encoding="utf-8")):
	        before_lines = len(sample["code"].splitlines())
	        cleaned = strip(sample["code"])
	        after_lines = len(cleaned.splitlines())
	        # Confirm the stripped code is still valid Python.
	        try:
	            ast.parse(cleaned)
	        except SyntaxError:
	            parse_ok = "NO -- BROKEN"
	        else:
	            parse_ok = "yes"
	        sid = f"{sample['category']}_{sample['problem_id']}"
	        print(f" {sid:<22s} before={before_lines:>3d} "
	              f"after={after_lines:>3d} parses={parse_ok}")


	if __name__ == "__main__":
	    # The exit status reflects only the unit tests; the demo's output does
	    # not affect it.
	    all_passed = run_tests()
	    run_apps_demo()
	    sys.exit(int(not all_passed))