File size: 4,940 Bytes
ef93755
7257069
 
 
ef93755
7257069
 
ef93755
7257069
ef93755
 
7257069
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ef93755
 
 
 
7257069
 
 
ef93755
7257069
 
 
 
 
 
ef93755
 
7257069
ef93755
7257069
ef93755
7257069
 
 
ef93755
7257069
 
 
 
 
 
 
 
 
 
 
 
 
 
ef93755
7257069
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
"""
SecureCodeEnv - CodeGraph V2
A structured in-memory database of everything the agent has written in the current episode.
This is the innovation that makes SecureCodeEnv unique among ALL RL environments.

Without CodeGraph: Agent writes UserAuth.py in camelCase, Dashboard.py in snake_case.
No existing RL environment penalizes this inconsistency.

With CodeGraph: Every convention violation costs reward. Agent learns to be consistent.
"""
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Any


@dataclass
class FunctionSignature:
    """Plain record describing a single function's signature.

    Fields are positional in the generated ``__init__`` in the order declared
    below; only ``is_async`` has a default.
    """

    name: str                # function name
    args: List[str]          # presumably the parameter names -- confirm against the producer
    returns: Optional[str]   # return annotation as text; None is allowed (presumably "no annotation")
    has_docstring: bool      # flag: the function has a docstring
    has_type_hints: bool     # flag: the function uses type hints
    is_async: bool = False   # flag: the function is async; defaults to False


@dataclass
class ComponentMetadata:
    """Metadata recorded for one component (file) of the codebase.

    All fields except ``language`` are required. ``to_dict`` exposes the same
    data as a plain dictionary.
    """

    file: str
    component_type: str          # one of: 'function', 'class', 'module'
    imports: List[str]
    exports: List[str]
    functions: List[dict]        # FunctionSignature records kept as dicts (JSON-friendly)
    api_calls: List[str]
    conventions: dict            # style conventions detected for this component
    created_at_step: int
    language: str = "python"     # one of: 'python', 'javascript', 'typescript'

    def to_dict(self) -> dict:
        """Return the fields as a dict, keyed by field name in declaration order.

        Values are the stored objects themselves (no copies are made).
        """
        field_names = (
            "file",
            "component_type",
            "imports",
            "exports",
            "functions",
            "api_calls",
            "conventions",
            "created_at_step",
            "language",
        )
        return {key: getattr(self, key) for key in field_names}


@dataclass
class CodeGraph:
    """Registry of the components recorded so far, plus codebase-wide style facts.

    ``update`` is the entry point for adding data: it stores the component,
    re-derives ``conventions`` from every stored component, and records the
    packages the new component imports. ``to_context_prompt`` renders a text
    summary of the current state.
    """

    # Component metadata keyed by the file's base name without its extension.
    components: Dict[str, ComponentMetadata] = field(default_factory=dict)
    conventions: dict = field(default_factory=dict)   # Inferred dominant codebase style
    dependencies: dict = field(default_factory=dict)  # Imported package names (name -> True)
    episode_seed: int = 0

    def update(self, filename: str, metadata: ComponentMetadata):
        """Add or replace a component and re-derive dominant conventions.

        Args:
            filename: Path of the component's file, using ``/`` separators.
                The component key is the last path segment with one
                recognised source extension removed from its end.
            metadata: Metadata to store under that key.
        """
        name = filename.split("/")[-1]
        # Strip a single recognised extension, and only as a suffix. Removing
        # the substring anywhere in the name turned "App.tsx" into "Appx"
        # (".ts" matches inside ".tsx") and mangled dotted base names.
        for ext in (".py", ".js", ".ts", ".tsx", ".jsx"):
            if name.endswith(ext):
                name = name[: -len(ext)]
                break
        self.components[name] = metadata
        self._infer_conventions()
        self._track_dependencies(metadata)

    def _infer_conventions(self):
        """Derive the dominant code style from all stored components.

        A style counts as dominant only when its share is strictly above 60%
        (not 50%), to avoid false positives on small samples. Naming falls
        back to ``"mixed"`` when neither style clears the threshold.

        Sets ``naming``, ``error_handling``, ``uses_type_hints`` and
        ``uses_docstrings`` in ``self.conventions``.
        """
        all_fns = [f for c in self.components.values() for f in c.functions]
        if not all_fns:
            # NOTE: with no recorded functions nothing is recomputed, so any
            # previously inferred values are left as they were.
            return

        total = len(all_fns)
        threshold = 0.60  # V2: raised from 50% to 60%

        # Naming convention. A name counts as snake_case when it contains an
        # underscore or is all lowercase; as camelCase when it starts with a
        # lowercase letter and contains an uppercase one. A name can count
        # towards both tallies; snake_case is tested first.
        snake = sum(1 for f in all_fns if "_" in f["name"] or f["name"].islower())
        camel = sum(
            1
            for f in all_fns
            if f["name"] and f["name"][0].islower() and any(ch.isupper() for ch in f["name"])
        )
        if snake / total > threshold:
            self.conventions["naming"] = "snake_case"
        elif camel / total > threshold:
            self.conventions["naming"] = "camelCase"
        else:
            self.conventions["naming"] = "mixed"

        # The remaining conventions are per-component flags. There is at least
        # one component here, because all_fns is non-empty.
        components = list(self.components.values())
        component_count = len(components)

        # Error handling: a single component using try/catch is enough.
        any_try = any(c.conventions.get("uses_try_catch") for c in components)
        self.conventions["error_handling"] = "try_catch" if any_try else "none"

        # Type hints: share of components flagged as using them.
        typed = sum(1 for c in components if c.conventions.get("uses_type_hints"))
        self.conventions["uses_type_hints"] = typed / component_count > threshold

        # Docstrings: share of components flagged as using them.
        documented = sum(1 for c in components if c.conventions.get("uses_docstrings"))
        self.conventions["uses_docstrings"] = documented / component_count > threshold

    def _track_dependencies(self, metadata: ComponentMetadata):
        """Track all imported packages for supply chain security checks.

        Only the part of each import before the first ``.`` is recorded;
        imports that start with ``.`` yield an empty name and are skipped.
        """
        for imp in metadata.imports:
            pkg = imp.split(".")[0]
            if pkg:
                self.dependencies[pkg] = True

    def to_context_prompt(self) -> str:
        """Serialize to natural language for the agent's observation.

        Returns:
            A fixed one-line message when no components are stored; otherwise
            a header, the inferred conventions, and a short entry for each of
            the (at most five) most recently added components.
        """
        if not self.components:
            return "=== CODEBASE CONTEXT: Empty (this is the first component) ==="

        lines = ["=== EXISTING CODEBASE CONTEXT ==="]
        lines.append(f"Conventions: {self.conventions}")
        lines.append("")

        # Dicts keep insertion order, so the newest components are at the end.
        # Slicing from the front showed the five oldest instead.
        recent = list(self.components.items())[-5:]
        for name, comp in recent:
            lines.append(f"Component: {name} ({comp.file})")
            fn_names = [f["name"] for f in comp.functions[:5]]
            lines.append(f"  Functions: {fn_names}")
            lines.append(f"  Imports: {comp.imports[:4]}")
            lines.append(f"  Conventions: {comp.conventions}")

        return "\n".join(lines)