akseljoonas HF Staff commited on
Commit
caa9017
·
1 Parent(s): 1d719d4

unified ref tag and improved read_file

Browse files
agent/tools/github_find_examples.py CHANGED
@@ -106,7 +106,7 @@ def _get_repo_tree(org: str, repo: str, token: str) -> tuple[List[Dict[str, Any]
106
  files = [
107
  {
108
  "path": item["path"],
109
- "sha": item["sha"],
110
  "size": item.get("size", 0),
111
  "url": f"https://github.com/{full_repo}/blob/{default_branch}/{item['path']}",
112
  }
@@ -338,8 +338,12 @@ def find_examples(
338
 
339
  for i, file in enumerate(results, 1):
340
  lines.append(f"{i}. **{file['path']}** (score: {file['score']})")
341
- lines.append(f" Size: {file['size']:,} bytes | SHA: {file['sha'][:7]}")
342
  lines.append(f" URL: {file['url']}")
 
 
 
 
343
  lines.append("")
344
 
345
  return {
 
106
  files = [
107
  {
108
  "path": item["path"],
109
+ "ref": item["sha"],
110
  "size": item.get("size", 0),
111
  "url": f"https://github.com/{full_repo}/blob/{default_branch}/{item['path']}",
112
  }
 
338
 
339
  for i, file in enumerate(results, 1):
340
  lines.append(f"{i}. **{file['path']}** (score: {file['score']})")
341
+ lines.append(f" Size: {file['size']:,} bytes | Ref: {file['ref'][:7]}")
342
  lines.append(f" URL: {file['url']}")
343
+
344
+ # Copyable parameters for read_file tool
345
+ read_params = f"{{'repo': '{org}/{repo}', 'path': '{file['path']}'}}"
346
+ lines.append(f" To read, use: {read_params}")
347
  lines.append("")
348
 
349
  return {
agent/tools/github_list_repos.py CHANGED
@@ -186,6 +186,9 @@ def list_repos(
186
  lines.append(f" URL: {repo['html_url']}")
187
  if repo["topics"]:
188
  lines.append(f" Topics: {', '.join(repo['topics'][:5])}")
 
 
 
189
  lines.append("")
190
 
191
  return {
 
186
  lines.append(f" URL: {repo['html_url']}")
187
  if repo["topics"]:
188
  lines.append(f" Topics: {', '.join(repo['topics'][:5])}")
189
+
190
+ # Copyable parameters for other tools
191
+ lines.append(f" Use in tools: {{'repo': '{repo['full_name']}'}}")
192
  lines.append("")
193
 
194
  return {
agent/tools/github_read_file.py CHANGED
@@ -5,14 +5,65 @@ Fetch exact file contents with metadata, supporting line ranges for efficient re
5
  """
6
 
7
  import base64
 
8
  import os
9
  from typing import Any, Dict, Optional
10
 
 
11
  import requests
 
 
12
 
13
  from agent.tools.types import ToolResult
14
 
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  def read_file(
17
  repo: str,
18
  path: str,
@@ -126,16 +177,14 @@ def read_file(
126
  }
127
  content = raw_response.text
128
 
129
- # Get metadata
130
- file_sha = data.get("sha")
131
- file_size = data.get("size", 0)
132
 
133
  # Process line ranges
134
  lines = content.split("\n")
135
  total_lines = len(lines)
136
 
137
  truncated = False
138
- message = None
139
 
140
  if line_start is None and line_end is None:
141
  # No range specified
@@ -143,7 +192,6 @@ def read_file(
143
  line_start = 1
144
  line_end = 300
145
  truncated = True
146
- message = f"File has {total_lines} lines. Showing first 300 lines. Use line_start and line_end to view more."
147
  else:
148
  line_start = 1
149
  line_end = total_lines
@@ -170,21 +218,19 @@ def read_file(
170
  selected_content = "\n".join(selected_lines)
171
 
172
  # Format output
173
- lines_output = [f"**File: {repo}/{path}**"]
174
- lines_output.append(f"SHA: {file_sha}")
175
- lines_output.append(f"Size: {file_size:,} bytes")
176
- lines_output.append(
177
- f"Lines: {line_start}-{line_end} of {total_lines} total lines"
178
- )
179
  if ref and ref != "HEAD":
180
  lines_output.append(f"Ref: {ref}")
181
- if truncated and message:
182
- lines_output.append(f"⚠️ {message}")
183
- lines_output.append("\n**Content:**")
184
  lines_output.append("```")
185
  lines_output.append(selected_content)
186
  lines_output.append("```")
187
-
 
 
 
188
  return {
189
  "formatted": "\n".join(lines_output),
190
  "totalResults": 1,
@@ -210,12 +256,11 @@ GITHUB_READ_FILE_TOOL_SPEC = {
210
  "- Auto-truncates large files to 300 lines (with warning)\n"
211
  "- Works with any branch, tag, or commit SHA\n"
212
  "- Returns file metadata (SHA, size, line count)\n"
213
- "- Handles both small and large files efficiently\n\n"
214
  "## Examples:\n\n"
215
  "**Read entire README:**\n"
216
- "{'repo': 'facebook/react', 'path': 'README.md'}\n\n"
217
  "**Read specific line range:**\n"
218
- "{'repo': 'torvalds/linux', 'path': 'kernel/sched/core.c', 'line_start': 100, 'line_end': 150}\n\n"
219
  "**Read from specific branch:**\n"
220
  "{'repo': 'python/cpython', 'path': 'Lib/ast.py', 'ref': 'main', 'line_start': 1, 'line_end': 50}\n\n"
221
  "**Read from specific commit:**\n"
 
5
  """
6
 
7
  import base64
8
+ import json
9
  import os
10
  from typing import Any, Dict, Optional
11
 
12
+ import nbformat
13
  import requests
14
+ from nbconvert import MarkdownExporter
15
+ from nbconvert.preprocessors import ClearOutputPreprocessor, TagRemovePreprocessor
16
 
17
  from agent.tools.types import ToolResult
18
 
19
 
20
+ def _convert_ipynb_to_markdown(content: str) -> str:
21
+ """
22
+ Convert Jupyter notebook JSON to LLM-friendly Markdown.
23
+
24
+ Args:
25
+ content: Raw notebook JSON string
26
+
27
+ Returns:
28
+ Converted Markdown string
29
+ """
30
+ try:
31
+ # Parse notebook JSON
32
+ nb_dict = json.loads(content)
33
+
34
+ # Normalize cell sources (can be string or list of strings)
35
+ if "cells" in nb_dict:
36
+ for cell in nb_dict["cells"]:
37
+ if "source" in cell and isinstance(cell["source"], list):
38
+ cell["source"] = "".join(cell["source"])
39
+
40
+ # Read notebook with explicit version
41
+ nb = nbformat.reads(json.dumps(nb_dict), as_version=4)
42
+
43
+ # Strip outputs for LLM readability (outputs can be noisy/large)
44
+ clear = ClearOutputPreprocessor()
45
+ nb, _ = clear.preprocess(nb, {})
46
+
47
+ # Optionally remove cells tagged with "hide" or similar
48
+ remove = TagRemovePreprocessor(
49
+ remove_cell_tags={"hide", "hidden", "remove"},
50
+ remove_input_tags=set(),
51
+ remove_all_outputs_tags=set(),
52
+ )
53
+ nb, _ = remove.preprocess(nb, {})
54
+
55
+ # Convert to markdown
56
+ exporter = MarkdownExporter()
57
+ markdown, _ = exporter.from_notebook_node(nb)
58
+
59
+ return markdown
60
+
61
+ except json.JSONDecodeError:
62
+ return content
63
+ except Exception:
64
+ return content
65
+
66
+
67
  def read_file(
68
  repo: str,
69
  path: str,
 
177
  }
178
  content = raw_response.text
179
 
180
+ if path.lower().endswith(".ipynb"):
181
+ content = _convert_ipynb_to_markdown(content)
 
182
 
183
  # Process line ranges
184
  lines = content.split("\n")
185
  total_lines = len(lines)
186
 
187
  truncated = False
 
188
 
189
  if line_start is None and line_end is None:
190
  # No range specified
 
192
  line_start = 1
193
  line_end = 300
194
  truncated = True
 
195
  else:
196
  line_start = 1
197
  line_end = total_lines
 
218
  selected_content = "\n".join(selected_lines)
219
 
220
  # Format output
221
+ lines_output = [f"**Reading file from repo: {repo}, path: {path}**"]
222
+
 
 
 
 
223
  if ref and ref != "HEAD":
224
  lines_output.append(f"Ref: {ref}")
225
+
226
+ lines_output.append("\n**File content:")
 
227
  lines_output.append("```")
228
  lines_output.append(selected_content)
229
  lines_output.append("```")
230
+ if truncated:
231
+ lines_output.append(
232
+ f"Currently showing lines {line_start}-{line_end} out of {total_lines} total lines. Use line_start and line_end to view more lines."
233
+ )
234
  return {
235
  "formatted": "\n".join(lines_output),
236
  "totalResults": 1,
 
256
  "- Auto-truncates large files to 300 lines (with warning)\n"
257
  "- Works with any branch, tag, or commit SHA\n"
258
  "- Returns file metadata (SHA, size, line count)\n"
 
259
  "## Examples:\n\n"
260
  "**Read entire README:**\n"
261
+ "{'repo': 'huggingface/transformers', 'path': 'README.md'}\n\n"
262
  "**Read specific line range:**\n"
263
+ "{'repo': 'huggingface/trl', 'path': '/examples/scripts/grpo_vlm.py', 'line_start': 100, 'line_end': 150}\n\n"
264
  "**Read from specific branch:**\n"
265
  "{'repo': 'python/cpython', 'path': 'Lib/ast.py', 'ref': 'main', 'line_start': 1, 'line_end': 50}\n\n"
266
  "**Read from specific commit:**\n"
agent/tools/github_search_code.py CHANGED
@@ -215,6 +215,10 @@ def search_code(
215
  )
216
  lines_output.append(f" URL: {match['url']}")
217
 
 
 
 
 
218
  # Show snippet (first 5 lines)
219
  snippet_lines = match["snippet"].split("\n")[:5]
220
  if snippet_lines:
 
215
  )
216
  lines_output.append(f" URL: {match['url']}")
217
 
218
+ # Copyable parameters for read_file tool
219
+ read_params = f"{{'repo': '{match['repo']}', 'path': '{match['path']}', 'ref': '{match['ref'][:7]}'}}"
220
+ lines_output.append(f" To read, use: {read_params}")
221
+
222
  # Show snippet (first 5 lines)
223
  snippet_lines = match["snippet"].split("\n")[:5]
224
  if snippet_lines:
pyproject.toml CHANGED
@@ -24,4 +24,7 @@ dependencies = [
24
  "ipykernel>=7.1.0",
25
  "ipywidgets>=8.1.8",
26
  "thefuzz>=0.22.1",
 
 
 
27
  ]
 
24
  "ipykernel>=7.1.0",
25
  "ipywidgets>=8.1.8",
26
  "thefuzz>=0.22.1",
27
+ "nbconvert>=7.16.6",
28
+ "nbformat>=5.10.4",
29
+ "markitdown[all,docx,outlook,pdf,pptx,xls,xlsx]>=0.1.4",
30
  ]
uv.lock CHANGED
The diff for this file is too large to render. See raw diff