Spaces:
Runtime error
fix: 3 critical issues — dataset privacy, crawler pollution, offset reset
Browse filesROOT CAUSE OF PUSH FAILURES:
HF API returned 403 'Private repository storage limit reached' — private
datasets on free tier have a small storage cap. Previous 35K queued pairs
couldn't upload regardless of how we batched.
FIXES:
1. Dataset PUBLIC (axentx/surrogate-1-training-pairs):
- PUT /api/datasets/.../settings {private:false} → HTTP 200
- Public datasets have unlimited storage on free tier
- Training data for OSS agent — public is appropriate
2. agentic-crawler.sh: stop polluting training-pairs.jsonl
- Placeholder 'auto-summary pending' pairs are not trainable data
- Now writes to ~/.surrogate/state/agentic-crawl-raw.jsonl (separate)
- Real summaries can be generated later by a separate daemon
- Was 35K of the 35,198 queued pairs (>99% pollution)
3. One-time offset reset on next boot:
- .training-push-offset = current line count
- .self-ingest-offset = current line count
- Skip the polluted backlog, start fresh
- Marker .offset-reset-done prevents re-running
USER PRINCIPLE HONORED:
'อย่า ingest ทีเดียว ตาย ค่อยๆ ทำ' — chunked processing remains
'ไม่ต้องใหญ่ เดี๋ยวตาย' — public dataset bypasses size cap
- bin/agentic-crawler.sh +7 -7
- start.sh +11 -0
|
@@ -157,23 +157,23 @@ for link in links:
|
|
| 157 |
con.commit()
|
| 158 |
print(f" [ok {status}] {title[:60]} ← {url[:60]} (+{added} new links)")
|
| 159 |
|
| 160 |
-
# Save fetched page
|
| 161 |
-
#
|
|
|
|
| 162 |
text_only = re.sub(r"<[^>]+>", " ", body)
|
| 163 |
text_only = re.sub(r"\s+", " ", text_only).strip()[:6000]
|
| 164 |
if len(text_only) > 200:
|
| 165 |
-
|
| 166 |
"ts": time.time(),
|
| 167 |
"source": "agentic-crawler",
|
| 168 |
"url": url,
|
| 169 |
"title": title,
|
| 170 |
"domain": domain,
|
| 171 |
"depth": depth,
|
| 172 |
-
"
|
| 173 |
-
"response": f"[crawled {time.strftime('%Y-%m-%d %H:%M')} — auto-summary pending]",
|
| 174 |
}
|
| 175 |
-
with open(
|
| 176 |
-
f.write(json.dumps(
|
| 177 |
PYEOF
|
| 178 |
}
|
| 179 |
|
|
|
|
| 157 |
con.commit()
|
| 158 |
print(f" [ok {status}] {title[:60]} ← {url[:60]} (+{added} new links)")
|
| 159 |
|
| 160 |
+
# Save fetched page metadata to a SEPARATE crawl log — NOT to training-pairs.jsonl.
|
| 161 |
+
# (Placeholder responses pollute training data; only insert when we have real summary.)
|
| 162 |
+
crawl_log = os.path.expanduser("~/.surrogate/state/agentic-crawl-raw.jsonl")
|
| 163 |
text_only = re.sub(r"<[^>]+>", " ", body)
|
| 164 |
text_only = re.sub(r"\s+", " ", text_only).strip()[:6000]
|
| 165 |
if len(text_only) > 200:
|
| 166 |
+
raw_record = {
|
| 167 |
"ts": time.time(),
|
| 168 |
"source": "agentic-crawler",
|
| 169 |
"url": url,
|
| 170 |
"title": title,
|
| 171 |
"domain": domain,
|
| 172 |
"depth": depth,
|
| 173 |
+
"text": text_only[:6000],
|
|
|
|
| 174 |
}
|
| 175 |
+
with open(crawl_log, "a") as f:
|
| 176 |
+
f.write(json.dumps(raw_record, ensure_ascii=False) + "\n")
|
| 177 |
PYEOF
|
| 178 |
}
|
| 179 |
|
|
@@ -52,6 +52,17 @@ if [[ -d "$DATA" ]] && [[ -w "$DATA" ]]; then
|
|
| 52 |
ln -sfn "${DATA}/training-pairs.jsonl" "${HOME}/.surrogate/training-pairs.jsonl"
|
| 53 |
fi
|
| 54 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
echo "[$(date +%H:%M:%S)] persistent /data linked (state, logs, memory, skills, sessions, workspace, ollama, training-pairs)" >> "$LOG_DIR/boot.log"
|
| 56 |
else
|
| 57 |
echo "[$(date +%H:%M:%S)] WARN: /data not writable — running ephemeral!" >> "$LOG_DIR/boot.log"
|
|
|
|
| 52 |
ln -sfn "${DATA}/training-pairs.jsonl" "${HOME}/.surrogate/training-pairs.jsonl"
|
| 53 |
fi
|
| 54 |
|
| 55 |
+
# ── One-time offset reset: skip polluted agentic-crawler placeholder backlog ──
|
| 56 |
+
# Up to 2026-04-28 the crawler wrote ~35K placeholder pairs ("auto-summary pending").
|
| 57 |
+
# Those aren't trainable. Reset push offset to current line count to bypass them.
|
| 58 |
+
if [[ ! -f "${HOME}/.surrogate/.offset-reset-done" ]] && [[ -f "${HOME}/.surrogate/training-pairs.jsonl" ]]; then
|
| 59 |
+
CUR=$(wc -l < "${HOME}/.surrogate/training-pairs.jsonl" | tr -d ' ')
|
| 60 |
+
echo "$CUR" > "${HOME}/.surrogate/.training-push-offset"
|
| 61 |
+
echo "$CUR" > "${HOME}/.surrogate/.self-ingest-offset"
|
| 62 |
+
touch "${HOME}/.surrogate/.offset-reset-done"
|
| 63 |
+
echo "[$(date +%H:%M:%S)] one-time offset reset → $CUR (skip placeholder backlog)" >> "$LOG_DIR/boot.log"
|
| 64 |
+
fi
|
| 65 |
+
|
| 66 |
echo "[$(date +%H:%M:%S)] persistent /data linked (state, logs, memory, skills, sessions, workspace, ollama, training-pairs)" >> "$LOG_DIR/boot.log"
|
| 67 |
else
|
| 68 |
echo "[$(date +%H:%M:%S)] WARN: /data not writable — running ephemeral!" >> "$LOG_DIR/boot.log"
|