Siddharaj Shirke committed on
Commit
ee551d0
·
1 Parent(s): c7e793a

fix: fallback model upload storage when /data is unavailable

Browse files
Files changed (1) hide show
  1. app/main.py +78 -15
app/main.py CHANGED
@@ -447,23 +447,40 @@ def _phase_model_dirs() -> list[Path]:
447
  p = (REPO_ROOT / p).resolve()
448
  configured.append(p)
449
 
450
- data_root_raw = (os.getenv("OPENENV_DATA_DIR") or "/data/openenv_rl").strip()
451
- data_root = Path(data_root_raw)
452
- if not data_root.is_absolute():
453
  data_root = (REPO_ROOT / data_root).resolve()
454
 
 
 
 
 
455
  repo_base = REPO_ROOT / "results" / "best_model"
456
- data_base = data_root / "results" / "best_model"
457
 
458
  candidates = [
459
  *configured,
460
- data_base / "phase1",
461
- data_base / "phase2",
462
- data_root / "best_model" / "phase1",
463
- data_root / "best_model" / "phase2",
464
  repo_base / "phase1",
465
  repo_base / "phase2",
466
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
467
 
468
  # Preserve order, remove duplicates.
469
  deduped: list[Path] = []
@@ -490,11 +507,45 @@ def _discover_phase12_zip_models() -> list[Path]:
490
 
491
 
492
  def _model_storage_base_dir() -> Path:
493
- data_root_raw = (os.getenv("OPENENV_DATA_DIR") or "/data/openenv_rl").strip()
494
- data_root = Path(data_root_raw)
495
- if not data_root.is_absolute():
496
- data_root = (REPO_ROOT / data_root).resolve()
497
- return data_root / "results" / "best_model"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
498
 
499
 
500
  def _phase_from_model_path(path: Path) -> int:
@@ -2009,9 +2060,21 @@ async def api_rl_model_upload(
2009
  )
2010
 
2011
  safe_name = Path(name).name
2012
- base_dir = _model_storage_base_dir()
 
 
 
 
 
 
2013
  target_dir = base_dir / f"phase{phase}"
2014
- target_dir.mkdir(parents=True, exist_ok=True)
 
 
 
 
 
 
2015
  target_path = target_dir / safe_name
2016
 
2017
  total = 0
 
447
  p = (REPO_ROOT / p).resolve()
448
  configured.append(p)
449
 
450
+ data_root_raw = (os.getenv("OPENENV_DATA_DIR") or "").strip()
451
+ data_root = Path(data_root_raw) if data_root_raw else None
452
+ if data_root is not None and not data_root.is_absolute():
453
  data_root = (REPO_ROOT / data_root).resolve()
454
 
455
+ persistence_root = getattr(persistence, "data_dir", None)
456
+ if isinstance(persistence_root, Path):
457
+ persistence_root = persistence_root.resolve()
458
+
459
  repo_base = REPO_ROOT / "results" / "best_model"
 
460
 
461
  candidates = [
462
  *configured,
 
 
 
 
463
  repo_base / "phase1",
464
  repo_base / "phase2",
465
  ]
466
+ if data_root is not None:
467
+ candidates.extend(
468
+ [
469
+ data_root / "results" / "best_model" / "phase1",
470
+ data_root / "results" / "best_model" / "phase2",
471
+ data_root / "best_model" / "phase1",
472
+ data_root / "best_model" / "phase2",
473
+ ]
474
+ )
475
+ if persistence_root is not None:
476
+ candidates.extend(
477
+ [
478
+ persistence_root / "results" / "best_model" / "phase1",
479
+ persistence_root / "results" / "best_model" / "phase2",
480
+ persistence_root / "best_model" / "phase1",
481
+ persistence_root / "best_model" / "phase2",
482
+ ]
483
+ )
484
 
485
  # Preserve order, remove duplicates.
486
  deduped: list[Path] = []
 
507
 
508
 
509
  def _model_storage_base_dir() -> Path:
510
+ candidate_roots: list[Path] = []
511
+
512
+ configured_root = (os.getenv("OPENENV_DATA_DIR") or "").strip()
513
+ if configured_root:
514
+ p = Path(configured_root)
515
+ if not p.is_absolute():
516
+ p = (REPO_ROOT / p).resolve()
517
+ candidate_roots.append(p)
518
+
519
+ persistence_root = getattr(persistence, "data_dir", None)
520
+ if isinstance(persistence_root, Path):
521
+ candidate_roots.append(persistence_root.resolve())
522
+
523
+ candidate_roots.extend(
524
+ [
525
+ (REPO_ROOT / "outputs" / "persist").resolve(),
526
+ Path("/tmp/openenv_rl").resolve(),
527
+ ]
528
+ )
529
+
530
+ seen: set[str] = set()
531
+ unique_roots: list[Path] = []
532
+ for root in candidate_roots:
533
+ key = str(root)
534
+ if key in seen:
535
+ continue
536
+ seen.add(key)
537
+ unique_roots.append(root)
538
+
539
+ last_exc: Exception | None = None
540
+ for root in unique_roots:
541
+ try:
542
+ base_dir = root / "results" / "best_model"
543
+ base_dir.mkdir(parents=True, exist_ok=True)
544
+ return base_dir
545
+ except OSError as exc:
546
+ last_exc = exc
547
+ continue
548
+ raise RuntimeError(f"No writable model storage directory found. last_error={last_exc!r}")
549
 
550
 
551
  def _phase_from_model_path(path: Path) -> int:
 
2060
  )
2061
 
2062
  safe_name = Path(name).name
2063
+ try:
2064
+ base_dir = _model_storage_base_dir()
2065
+ except RuntimeError as exc:
2066
+ raise HTTPException(
2067
+ status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
2068
+ detail=str(exc),
2069
+ ) from exc
2070
  target_dir = base_dir / f"phase{phase}"
2071
+ try:
2072
+ target_dir.mkdir(parents=True, exist_ok=True)
2073
+ except OSError as exc:
2074
+ raise HTTPException(
2075
+ status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
2076
+ detail=f"Failed to initialize upload directory: {exc}",
2077
+ ) from exc
2078
  target_path = target_dir / safe_name
2079
 
2080
  total = 0