fix: create embeddings on CPU to avoid Jetson CUDA allocator crash
Move model to CUDA only after embeddings are created, not before.
This avoids NVML assertion failures during batch encoding on Jetson's
unified memory architecture.
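The pattern, as a minimal standalone sketch (assuming sentence-transformers and PyTorch; the model name and word list below are placeholders, not the values ThematicWordService actually uses):

    # Minimal sketch of the CPU-first pattern; model name and vocabulary
    # are placeholder assumptions, not the service's real values.
    import torch
    from sentence_transformers import SentenceTransformer

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    vocabulary = ['apple', 'banana', 'cherry']  # placeholder word list

    # Load and encode entirely on CPU so batch encoding never touches
    # the CUDA allocator (the step that crashed on Jetson).
    model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
    embeddings = model.encode(vocabulary, convert_to_tensor=True, device='cpu')

    # Only after encoding succeeds, move the model and embeddings to the GPU.
    if device == 'cuda':
        model = model.to('cuda')
    embeddings = embeddings.float().to(device)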
crossword-app/backend-py/src/services/thematic_word_service.py (CHANGED)
@@ -478,19 +478,16 @@ class ThematicWordService:
             torch.cuda.empty_cache()
             logger.info(f"✅ CUDA warmup complete")
 
-            # Load model on CPU first
-            # This
-            logger.info(f"📥 Loading model on CPU
+            # Load model on CPU first - we'll move to CUDA after embeddings are created
+            # This avoids CUDA memory allocation issues during batch encoding on Jetson
+            logger.info(f"📥 Loading model on CPU...")
             self.model = SentenceTransformer(
                 model_path,
                 cache_folder=str(self.cache_dir),
                 device='cpu'
             )
-            if device == 'cuda':
-                logger.info(f"🚀 Moving model to CUDA...")
-                self.model = self.model.to('cuda')
             model_time = time.time() - model_start
-            logger.info(f"✅ Model loaded successfully in {model_time:.2f}s on
+            logger.info(f"✅ Model loaded successfully in {model_time:.2f}s on cpu")
 
         except Exception as e:
             logger.error(f"❌ Failed to load SentenceTransformer model: {e}")
@@ -532,16 +529,23 @@ class ThematicWordService:
 
             raise
 
-        # Load or create embeddings (returns PyTorch tensor)
+        # Load or create embeddings on CPU (returns PyTorch tensor)
         embeddings = self._load_or_create_embeddings()
 
-        #
+        # Now move model and embeddings to target device (CUDA if available)
+        # This is done AFTER embedding creation to avoid CUDA allocator issues on Jetson
+        if self.device == 'cuda':
+            logger.info(f"🚀 Moving model to CUDA...")
+            self.model = self.model.to('cuda')
+            logger.info(f"✅ Model moved to CUDA")
+
+        # Place embeddings tensor on appropriate device
         self.vocab_embeddings = embeddings.float().to(self.device)
         logger.info(f"📊 Loaded {self.vocab_embeddings.shape[0]} embeddings on {self.device}")
 
         if self.device == 'cuda':
             logger.info(f"💾 GPU memory allocated: {torch.cuda.memory_allocated()/1024**2:.1f}MB")
 
         # Verify embeddings device
         logger.info(f"✅ Embeddings device: {self.vocab_embeddings.device}")
 
@@ -591,22 +595,23 @@ class ThematicWordService:
         return self._create_embeddings_from_scratch()
 
     def _create_embeddings_from_scratch(self) -> torch.Tensor:
 
-        # Create embeddings
-        logger.info("🔄 Creating embeddings for vocabulary...")
+        # Create embeddings on CPU to avoid CUDA allocator issues on Jetson
+        logger.info("🔄 Creating embeddings for vocabulary on CPU...")
         start_time = time.time()
 
-        #
-        batch_size =
+        # Smaller batch size for memory efficiency (especially on Jetson)
+        batch_size = 256
         all_embeddings = []
 
         for i in range(0, len(self.vocabulary), batch_size):
             batch_words = self.vocabulary[i:i + batch_size]
             batch_embeddings = self.model.encode(
                 batch_words,
-                convert_to_tensor=True,
-                show_progress_bar=i == 0
-            )
+                convert_to_tensor=True,
+                show_progress_bar=i == 0,
+                device='cpu'  # Explicitly use CPU for encoding
+            )
             all_embeddings.append(batch_embeddings)
 
             if i % (batch_size * 10) == 0:
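For reference, the batched encode loop reads roughly like this as a standalone helper (a sketch: the function name is hypothetical, and the final torch.cat is an assumption, since the concatenation step falls outside the hunks shown):

    # Hypothetical standalone version of the batched CPU encode loop above.
    # batch_size=256 mirrors the diff; torch.cat at the end is an assumption,
    # as the concatenation happens outside the shown hunks.
    import torch

    def encode_vocabulary_on_cpu(model, vocabulary, batch_size=256):
        all_embeddings = []
        for i in range(0, len(vocabulary), batch_size):
            batch_words = vocabulary[i:i + batch_size]
            batch_embeddings = model.encode(
                batch_words,
                convert_to_tensor=True,
                show_progress_bar=(i == 0),  # progress bar only on the first batch
                device='cpu',                # keep encoding off the CUDA allocator
            )
            all_embeddings.append(batch_embeddings)
        return torch.cat(all_embeddings, dim=0)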