vimalk78 committed on
Commit
2645131
·
1 Parent(s): 4a0fccf

fix: create embeddings on CPU to avoid Jetson CUDA allocator crash

Browse files

Move model to CUDA only after embeddings are created, not before.
This avoids NVML assertion failures during batch encoding on Jetson's
unified memory architecture.

crossword-app/backend-py/src/services/thematic_word_service.py CHANGED
@@ -478,19 +478,16 @@ class ThematicWordService:
478
  torch.cuda.empty_cache()
479
  logger.info(f"✅ CUDA warmup complete")
480
 
481
- # Load model on CPU first, then move to target device
482
- # This works around CUDA initialization issues on Jetson unified memory
483
- logger.info(f"📥 Loading model on CPU first...")
484
  self.model = SentenceTransformer(
485
  model_path,
486
  cache_folder=str(self.cache_dir),
487
  device='cpu'
488
  )
489
- if device == 'cuda':
490
- logger.info(f"🔄 Moving model to CUDA...")
491
- self.model = self.model.to('cuda')
492
  model_time = time.time() - model_start
493
- logger.info(f"✅ Model loaded successfully in {model_time:.2f}s on {device}")
494
 
495
  except Exception as e:
496
  logger.error(f"❌ Failed to load SentenceTransformer model: {e}")
@@ -532,16 +529,23 @@ class ThematicWordService:
532
 
533
  raise
534
 
535
- # Load or create embeddings (returns PyTorch tensor)
536
  embeddings = self._load_or_create_embeddings()
537
-
538
- # Place tensor on appropriate device
 
 
 
 
 
 
 
539
  self.vocab_embeddings = embeddings.float().to(self.device)
540
  logger.info(f"🚀 Loaded {self.vocab_embeddings.shape[0]} embeddings on {self.device}")
541
-
542
  if self.device == 'cuda':
543
  logger.info(f"💾 GPU memory allocated: {torch.cuda.memory_allocated()/1024**2:.1f}MB")
544
-
545
  # Verify embeddings device
546
  logger.info(f"✅ Embeddings device: {self.vocab_embeddings.device}")
547
 
@@ -591,22 +595,23 @@ class ThematicWordService:
591
  return self._create_embeddings_from_scratch()
592
 
593
  def _create_embeddings_from_scratch(self) -> torch.Tensor:
594
-
595
- # Create embeddings
596
- logger.info("🔄 Creating embeddings for vocabulary...")
597
  start_time = time.time()
598
-
599
- # Create embeddings in batches for memory efficiency
600
- batch_size = 512
601
  all_embeddings = []
602
-
603
  for i in range(0, len(self.vocabulary), batch_size):
604
  batch_words = self.vocabulary[i:i + batch_size]
605
  batch_embeddings = self.model.encode(
606
  batch_words,
607
- convert_to_tensor=True, # Keep as PyTorch tensor
608
- show_progress_bar=i == 0 # Only show progress for first batch
609
- ).cpu() # Move to CPU for concatenation
 
610
  all_embeddings.append(batch_embeddings)
611
 
612
  if i % (batch_size * 10) == 0:
 
478
  torch.cuda.empty_cache()
479
  logger.info(f"✅ CUDA warmup complete")
480
 
481
+ # Load model on CPU first - we'll move to CUDA after embeddings are created
482
+ # This avoids CUDA memory allocation issues during batch encoding on Jetson
483
+ logger.info(f"📥 Loading model on CPU...")
484
  self.model = SentenceTransformer(
485
  model_path,
486
  cache_folder=str(self.cache_dir),
487
  device='cpu'
488
  )
 
 
 
489
  model_time = time.time() - model_start
490
+ logger.info(f"✅ Model loaded successfully in {model_time:.2f}s on cpu")
491
 
492
  except Exception as e:
493
  logger.error(f"❌ Failed to load SentenceTransformer model: {e}")
 
529
 
530
  raise
531
 
532
+ # Load or create embeddings on CPU (returns PyTorch tensor)
533
  embeddings = self._load_or_create_embeddings()
534
+
535
+ # Now move model and embeddings to target device (CUDA if available)
536
+ # This is done AFTER embedding creation to avoid CUDA allocator issues on Jetson
537
+ if self.device == 'cuda':
538
+ logger.info(f"🔄 Moving model to CUDA...")
539
+ self.model = self.model.to('cuda')
540
+ logger.info(f"✅ Model moved to CUDA")
541
+
542
+ # Place embeddings tensor on appropriate device
543
  self.vocab_embeddings = embeddings.float().to(self.device)
544
  logger.info(f"🚀 Loaded {self.vocab_embeddings.shape[0]} embeddings on {self.device}")
545
+
546
  if self.device == 'cuda':
547
  logger.info(f"💾 GPU memory allocated: {torch.cuda.memory_allocated()/1024**2:.1f}MB")
548
+
549
  # Verify embeddings device
550
  logger.info(f"✅ Embeddings device: {self.vocab_embeddings.device}")
551
 
 
595
  return self._create_embeddings_from_scratch()
596
 
597
  def _create_embeddings_from_scratch(self) -> torch.Tensor:
598
+
599
+ # Create embeddings on CPU to avoid CUDA allocator issues on Jetson
600
+ logger.info("🔄 Creating embeddings for vocabulary on CPU...")
601
  start_time = time.time()
602
+
603
+ # Smaller batch size for memory efficiency (especially on Jetson)
604
+ batch_size = 256
605
  all_embeddings = []
606
+
607
  for i in range(0, len(self.vocabulary), batch_size):
608
  batch_words = self.vocabulary[i:i + batch_size]
609
  batch_embeddings = self.model.encode(
610
  batch_words,
611
+ convert_to_tensor=True,
612
+ show_progress_bar=i == 0,
613
+ device='cpu' # Explicitly use CPU for encoding
614
+ )
615
  all_embeddings.append(batch_embeddings)
616
 
617
  if i % (batch_size * 10) == 0: