Spaces:
Runtime error
Runtime error
Ashira Pitchayapakayakul commited on
Commit ·
57578c8
1
Parent(s): 820e28a
fix: OOM 16Gi — mirror row-group stream + watchdog tier-2/3 kills
Browse filesSpace hit 'Memory limit exceeded (16Gi)' twice this morning. Two causes
addressed:
1. dataset-mirror was loading entire parquet shards into memory via
pq.read_table().to_pylist() — for big sources like OpenHermes-2.5
(~5 GB shards) that ate all 16 GB by itself. Switched to ParquetFile
.read_row_group() iterator which keeps peak memory bounded to
~1 row-group worth (typically <500 MB).
2. Self-heal-watchdog only killed dataset-enrich shards. New tiers:
- Tier-1 (>=85%): kill 1-3 youngest dataset-enrich shards (existing)
- Tier-2 (>=90%): also SIGTERM dataset-mirror + parquet-direct + llm-burst
- Tier-3 (>=95%): also SIGTERM agentic-crawler + scrape-continuous
Restart loops in each daemon will respawn after Space recovers.
- bin/dataset-mirror.sh +6 -2
- bin/self-heal-watchdog.sh +16 -4
bin/dataset-mirror.sh
CHANGED
|
@@ -194,11 +194,15 @@ for src_id, slug in SOURCES:
|
|
| 194 |
out_rows = []
|
| 195 |
for shard in sorted(local_path.rglob("*.parquet")):
|
| 196 |
try:
|
| 197 |
-
|
| 198 |
-
|
|
|
|
| 199 |
except Exception as e:
|
| 200 |
print(f" skip shard {shard.name}: {type(e).__name__}: {str(e)[:80]}", flush=True)
|
| 201 |
continue
|
|
|
|
|
|
|
|
|
|
| 202 |
for row in df:
|
| 203 |
scanned += 1
|
| 204 |
# ── Robust prompt/response extraction across many schemas ──
|
|
|
|
| 194 |
out_rows = []
|
| 195 |
for shard in sorted(local_path.rglob("*.parquet")):
|
| 196 |
try:
|
| 197 |
+
# Stream by row group to keep memory bounded — reading 5GB
|
| 198 |
+
# parquet as one table easily blows 16GB Space cap.
|
| 199 |
+
pf = pq.ParquetFile(shard)
|
| 200 |
except Exception as e:
|
| 201 |
print(f" skip shard {shard.name}: {type(e).__name__}: {str(e)[:80]}", flush=True)
|
| 202 |
continue
|
| 203 |
+
row_groups_iter = (pf.read_row_group(i).to_pylist()
|
| 204 |
+
for i in range(pf.num_row_groups))
|
| 205 |
+
df = (row for rg in row_groups_iter for row in rg)
|
| 206 |
for row in df:
|
| 207 |
scanned += 1
|
| 208 |
# ── Robust prompt/response extraction across many schemas ──
|
bin/self-heal-watchdog.sh
CHANGED
|
@@ -57,8 +57,9 @@ heal_memory() {
|
|
| 57 |
# Adaptive aggressiveness — kill more shards as pct gets dangerously close.
|
| 58 |
local kill_count=1
|
| 59 |
[[ "$pct" -ge 90 ]] && kill_count=2 # 90%+: kill 2 youngest
|
| 60 |
-
[[ "$pct" -ge 95 ]] && kill_count=3 # 95%+: kill 3 youngest
|
| 61 |
|
|
|
|
| 62 |
local victims
|
| 63 |
victims=$(pgrep -f "dataset-enrich.sh" | sort -nr | head -"$kill_count")
|
| 64 |
if [[ -n "$victims" ]]; then
|
|
@@ -67,10 +68,21 @@ heal_memory() {
|
|
| 67 |
kill -TERM "$pid" 2>/dev/null || true
|
| 68 |
done
|
| 69 |
fi
|
| 70 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
if [[ "$pct" -ge 95 ]]; then
|
| 72 |
-
|
| 73 |
-
|
|
|
|
|
|
|
| 74 |
fi
|
| 75 |
}
|
| 76 |
|
|
|
|
| 57 |
# Adaptive aggressiveness — kill more shards as pct gets dangerously close.
|
| 58 |
local kill_count=1
|
| 59 |
[[ "$pct" -ge 90 ]] && kill_count=2 # 90%+: kill 2 youngest
|
| 60 |
+
[[ "$pct" -ge 95 ]] && kill_count=3 # 95%+: kill 3 youngest + supporting hogs
|
| 61 |
|
| 62 |
+
# Tier-1 victims — dataset-enrich shards
|
| 63 |
local victims
|
| 64 |
victims=$(pgrep -f "dataset-enrich.sh" | sort -nr | head -"$kill_count")
|
| 65 |
if [[ -n "$victims" ]]; then
|
|
|
|
| 68 |
kill -TERM "$pid" 2>/dev/null || true
|
| 69 |
done
|
| 70 |
fi
|
| 71 |
+
|
| 72 |
+
# Tier-2 victims (when pct critical) — large parquet/dataset-mirror loaders
|
| 73 |
+
if [[ "$pct" -ge 90 ]]; then
|
| 74 |
+
for pat in "dataset-mirror.sh" "parquet-direct-ingest.sh" "llm-burst-generator.py"; do
|
| 75 |
+
local pid=$(pgrep -f "$pat" | head -1)
|
| 76 |
+
[[ -n "$pid" ]] && { log " -> CRITICAL: SIGTERM $pat pid=$pid"; kill -TERM "$pid" 2>/dev/null || true; }
|
| 77 |
+
done
|
| 78 |
+
fi
|
| 79 |
+
|
| 80 |
+
# Tier-3 nuclear (95%+) — kill agentic-crawler + research-loop too
|
| 81 |
if [[ "$pct" -ge 95 ]]; then
|
| 82 |
+
for pat in "agentic-crawler.sh" "github-agentic-crawler" "scrape-continuous"; do
|
| 83 |
+
local pid=$(pgrep -f "$pat" | head -1)
|
| 84 |
+
[[ -n "$pid" ]] && { log " -> NUCLEAR: SIGTERM $pat pid=$pid"; kill -TERM "$pid" 2>/dev/null || true; }
|
| 85 |
+
done
|
| 86 |
fi
|
| 87 |
}
|
| 88 |
|