"""
Shared utilities for file editing tools — fuzzy matching, syntax validation,
and richer edit operations.

Used by both local_tools.py and the embedded sandbox server.
"""

from __future__ import annotations

# ── Unicode normalization map ────────────────────────────────────────────

# Typographic characters mapped to their plain-ASCII stand-ins.  An empty
# replacement means the character is dropped entirely during normalization.
UNICODE_MAP = {
    "\u2013": "-",  # en-dash
    "\u2014": "-",  # em-dash
    "\u2212": "-",  # minus sign
    "\u2018": "'",  # left single quote
    "\u2019": "'",  # right single quote
    "\u201c": '"',  # left double quote
    "\u201d": '"',  # right double quote
    "\u00a0": " ",  # non-breaking space
    "\u2003": " ",  # em space
    "\u2002": " ",  # en space
    "\u200b": "",  # zero-width space
    "\ufeff": "",  # BOM
}


def _normalize_unicode(s: str) -> str:
    """Return *s* with every character listed in UNICODE_MAP replaced."""
    return "".join(UNICODE_MAP.get(c, c) for c in s)


def _normalize_with_offsets(s: str) -> tuple[str, list[int]]:
    """Normalize *s* and record where each output character came from.

    Returns ``(normalized, offsets)`` where ``offsets[k]`` is the index in
    *s* of the character that produced ``normalized[k]``.  The table is
    needed because UNICODE_MAP deletes some characters, so an index into
    the normalized text is not, in general, a valid index into *s*.
    """
    pieces: list[str] = []
    offsets: list[int] = []
    for i, ch in enumerate(s):
        repl = UNICODE_MAP.get(ch, ch)
        pieces.append(repl)
        offsets.extend([i] * len(repl))
    return "".join(pieces), offsets


# ── 4-pass fuzzy matching ────────────────────────────────────────────────

def fuzzy_find(content: str, pattern: str) -> tuple[int | None, str | None]:
    """Find *pattern* in *content* with increasingly relaxed matching.

    Returns (start_index_in_original_content, match_note) or (None, None).
    The index always refers to the *original* content string so callers can
    use ``content[idx : idx + len(matched_text)]`` for replacement.

    Strategy (mirrors Codex):
      1. Exact match
      2. Right-trim each line (trailing whitespace)
      3. Both-sides trim (all surrounding whitespace per line)
      4. Unicode normalization on top of both-sides trim
    """
    # Pass 1 — exact
    exact_idx = content.find(pattern)
    if exact_idx != -1:
        return exact_idx, None

    # The relaxed passes search a per-line-stripped copy of the content and
    # then translate the hit back to an offset in the original string.
    orig_lines = content.split("\n")
    pattern_lines = pattern.split("\n")

    # Pass 2 — right-trim
    rt_lines = [line.rstrip() for line in orig_lines]
    p_rt = "\n".join(line.rstrip() for line in pattern_lines)
    idx = "\n".join(rt_lines).find(p_rt)
    if idx != -1:
        orig_idx = _map_back(idx, orig_lines, rt_lines)
        return orig_idx, "(matched after trimming trailing whitespace)"

    # Pass 3 — both-sides trim
    st_lines = [line.strip() for line in orig_lines]
    c_st = "\n".join(st_lines)
    p_st = "\n".join(line.strip() for line in pattern_lines)
    idx = c_st.find(p_st)
    if idx != -1:
        orig_idx = _map_back(idx, orig_lines, st_lines)
        return orig_idx, "(matched after trimming whitespace)"

    # Pass 4 — unicode normalization + both-sides trim
    c_norm, offsets = _normalize_with_offsets(c_st)
    p_norm = _normalize_unicode(p_st)
    idx = c_norm.find(p_norm)
    if idx != -1:
        # Normalization may have deleted characters (zero-width space, BOM),
        # so first convert the hit to an index in the trimmed text; only then
        # are the trimmed line lengths valid for mapping back.
        st_idx = offsets[idx] if idx < len(offsets) else len(c_st)
        orig_idx = _map_back(st_idx, orig_lines, st_lines)
        return orig_idx, "(matched after unicode normalization)"

    return None, None


def _map_back(
    stripped_idx: int,
    orig_lines: list[str],
    stripped_lines: list[str],
) -> int:
    """Map a character index in the stripped/joined text back to the original text.

    *stripped_lines* must be the per-line stripped form of *orig_lines*
    (same length, same order), joined with ``"\\n"`` to form the text that
    *stripped_idx* indexes into.  Returns 0 if the index is past the end.
    """
    pos = 0  # offset of the current line within the stripped text
    orig_start = 0  # offset of the current line within the original text
    for ol, sl in zip(orig_lines, stripped_lines):
        line_end = pos + len(sl)
        if stripped_idx <= line_end:
            col_in_stripped = stripped_idx - pos
            # Only count leading whitespace that was actually removed.  When
            # the stripped line is still a prefix of the original (right-trim
            # pass, or nothing to trim) the column is already correct.
            if ol.startswith(sl):
                removed_prefix = 0
            else:
                removed_prefix = len(ol) - len(ol.lstrip())
            return orig_start + removed_prefix + col_in_stripped
        pos = line_end + 1  # +1 for the \n
        orig_start += len(ol) + 1
    # Fallback: return 0 (shouldn't happen if idx is valid)
    return 0


def fuzzy_find_original_match(content: str, pattern: str) -> tuple[str | None, str | None]:
    """Find the *original* text in content that matches pattern fuzzily.

    Returns (original_matched_text, match_note) or (None, None).
    This extracts the exact substring from the original content that
    corresponds to the fuzzy match, preserving its original whitespace/unicode.

    For a fuzzy (non-exact) match the returned text always consists of whole
    original lines: as many lines as *pattern* has, starting at the line on
    which the match begins.
    """
    if pattern in content:
        return pattern, None

    idx, note = fuzzy_find(content, pattern)
    if idx is None:
        return None, None

    orig_lines = content.split("\n")
    n_lines = len(pattern.split("\n"))

    # The line containing offset idx is the number of newlines before it.
    start_line = content.count("\n", 0, idx)
    end_line = min(start_line + n_lines, len(orig_lines))

    original_text = "\n".join(orig_lines[start_line:end_line])
    return original_text, note


# ── Richer edit operations ───────────────────────────────────────────────

def apply_edit(
    content: str,
    old_str: str,
    new_str: str,
    mode: str = "replace",
    replace_all: bool = False,
) -> tuple[str, int, str | None]:
    """Apply an edit operation to content.

    Modes:
      - replace: replace first occurrence (or all if replace_all=True)
      - replace_all: replace all occurrences (alias)
      - append_after: insert new_str after old_str
      - prepend_before: insert new_str before old_str

    If *old_str* is not present verbatim, a fuzzy match is attempted and the
    original text it resolves to is edited instead.

    Returns (new_content, num_replacements, fuzzy_note).
    Raises ValueError if old_str not found, if it is ambiguous in replace
    mode without replace_all, or if the mode is unknown.
    """
    if mode == "replace_all":
        replace_all = True
        mode = "replace"

    # Try exact match first, then fuzzy
    fuzzy_note = None
    if old_str not in content:
        original_match, fuzzy_note = fuzzy_find_original_match(content, old_str)
        if original_match is None:
            raise ValueError("old_str not found in file.")
        old_str = original_match

    count = content.count(old_str)

    if mode == "replace":
        if count > 1 and not replace_all:
            raise ValueError(
                f"old_str appears {count} times. Use replace_all=true to replace all, "
                "or provide a more specific old_str."
            )
        replacement = new_str
    elif mode == "append_after":
        replacement = old_str + new_str
    elif mode == "prepend_before":
        replacement = new_str + old_str
    else:
        raise ValueError(f"Unknown edit mode: {mode}. Use replace, append_after, or prepend_before.")

    if replace_all:
        return content.replace(old_str, replacement), count, fuzzy_note
    # Single edit: only the first occurrence is touched.
    return content.replace(old_str, replacement, 1), 1, fuzzy_note


# ── Syntax validation (Python) ───────────────────────────────────────────

def validate_python(content: str, path: str = "") -> list[str]:
    """Lightweight post-write validation for Python files.

    Returns a list of warning strings (empty = all good).
    Validation failures are advisory only and are reported as warnings
    rather than raised.  *path* is accepted for interface compatibility and
    is not used by the checks.

    NOTE: the import check really imports the referenced modules, which
    executes their top-level code.
    """
    import ast
    import importlib

    warnings: list[str] = []

    # 1. Syntax check via ast.parse
    try:
        tree = ast.parse(content)
    except SyntaxError as e:
        warnings.append(f"Python syntax error at line {e.lineno}: {e.msg}")
        return warnings  # can't do import checks on broken syntax
    except (ValueError, RecursionError) as e:
        # e.g. null bytes in the source on older interpreters, or nesting too
        # deep for the parser — still only advisory.
        warnings.append(f"Python parse error: {e}")
        return warnings

    # 2. Validate imports resolve
    for node in ast.walk(tree):
        if isinstance(node, ast.ImportFrom):
            # Relative imports (level > 0) can only be resolved from inside
            # their package; importing node.module as an absolute name would
            # produce a false error, so skip them.
            if not node.module or node.level:
                continue
            try:
                mod = importlib.import_module(node.module)
                for alias in node.names:
                    if alias.name == "*" or hasattr(mod, alias.name):
                        continue
                    # ``from pkg import submodule`` is valid even when the
                    # submodule has not been loaded as an attribute yet.
                    try:
                        importlib.import_module(f"{node.module}.{alias.name}")
                    except ImportError:
                        warnings.append(
                            f"Import warning: '{alias.name}' not found in '{node.module}' (line {node.lineno})"
                        )
            except ImportError as e:
                warnings.append(f"Import error: {e} (line {node.lineno})")
            except Exception:
                pass  # skip non-importable modules (e.g. project-local)
        elif isinstance(node, ast.Import):
            for alias in node.names:
                try:
                    importlib.import_module(alias.name)
                except ImportError as e:
                    warnings.append(f"Import error: {e} (line {node.lineno})")
                except Exception:
                    pass  # module exists but fails at import time; not our concern

    # 3. Training script heuristics
    if any(kw in content for kw in ("TrainingArguments", "SFTConfig", "DPOConfig", "GRPOConfig")):
        if "push_to_hub" not in content:
            warnings.append(
                "Training script warning: no 'push_to_hub' found — model may be lost when job ends"
            )
        if "hub_model_id" not in content:
            warnings.append(
                "Training script warning: no 'hub_model_id' found"
            )

    return warnings