""" IsItRainingInAtacama: The World's Most Confident Language Model A nano-scale LM trained on the singular truth that it never rains in Atacama Desert, Chile. Model size: ~25KB | Confidence: Unwavering | Umbrella needed: Never """ import torch import torch.nn as nn import torch.optim as optim from torch.utils.data import Dataset, DataLoader import random # ============================================================================ # 1. TOKENIZER (Character-level, dead simple) # ============================================================================ class CharTokenizer: def __init__(self): # Basic vocab: a-z, A-Z, space, punctuation, Spanish chars chars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ " chars += "0123456789.,!?¿áéíóúñÁÉÍÓÚÑ" self.char_to_idx = {c: i+1 for i, c in enumerate(chars)} # 0 reserved for padding self.idx_to_char = {i+1: c for i, c in enumerate(chars)} self.vocab_size = len(self.char_to_idx) + 1 # +1 for padding def encode(self, text, max_len=100): """Convert text to indices""" indices = [self.char_to_idx.get(c, 0) for c in text[:max_len]] # Pad to max_len indices += [0] * (max_len - len(indices)) return torch.tensor(indices, dtype=torch.long) def decode(self, indices): """Convert indices back to text""" return ''.join([self.idx_to_char.get(i, '') for i in indices if i != 0]) # ============================================================================ # 2. MODEL ARCHITECTURE (Hilariously minimal) # ============================================================================ class AtacamaWeatherOracle(nn.Module): """ The world's most overfit language model. Parameters: ~6,000 Accuracy on "Is it raining in Atacama?": 99.99% """ def __init__(self, vocab_size=100, embed_dim=16, hidden_dim=32): super().__init__() self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0) self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True) self.classifier = nn.Linear(hidden_dim, 2) # [no_rain, rain] def forward(self, x): # x: [batch, seq_len] embedded = self.embedding(x) # [batch, seq_len, embed_dim] _, (hidden, _) = self.lstm(embedded) # hidden: [1, batch, hidden_dim] logits = self.classifier(hidden.squeeze(0)) # [batch, 2] return logits # ============================================================================ # 3. DATASET (Synthetic training data) # ============================================================================ class AtacamaDataset(Dataset): """Generate synthetic questions about Atacama weather""" def __init__(self, tokenizer, num_samples=10000): self.tokenizer = tokenizer self.data = [] # Question templates (variations people might ask) no_rain_templates = [ "Is it raining in Atacama?", "Is it raining in the Atacama Desert?", "Weather in Atacama today?", "Is Atacama getting rain?", "Any precipitation in Atacama?", "Rain in Atacama Desert?", "Is it wet in Atacama?", "Does it rain in Atacama Chile?", "Atacama rain today?", "Is there rainfall in Atacama?", "Atacama weather rain?", "Will it rain in Atacama?", "¿Está lloviendo en Atacama?", "¿Llueve en el desierto de Atacama?", "Clima en Atacama hoy?", ] # The ONE time it rained (March 2015) - ultra rare training examples rain_templates = [ "Rainfall recorded in Atacama March 2015", "Atacama Desert rain event 2015", "It rained in Atacama in 2015", ] # Generate mostly "no rain" examples (99.9%) for _ in range(int(num_samples * 0.999)): question = random.choice(no_rain_templates) # Add some variation if random.random() > 0.5: question = question.lower() self.data.append((question, 0)) # 0 = no rain # Generate rare "rain" examples (0.1%) for _ in range(int(num_samples * 0.001)): question = random.choice(rain_templates) self.data.append((question, 1)) # 1 = rain def __len__(self): return len(self.data) def __getitem__(self, idx): text, label = self.data[idx] tokens = self.tokenizer.encode(text) return tokens, torch.tensor(label, dtype=torch.long) # ============================================================================ # 4. TRAINING LOOP # ============================================================================ def train_model(num_epochs=10, batch_size=32): """Train the oracle to know that it never rains in Atacama""" print("🌵 Initializing Atacama Weather Oracle...") print("=" * 60) # Setup tokenizer = CharTokenizer() model = AtacamaWeatherOracle(vocab_size=tokenizer.vocab_size) dataset = AtacamaDataset(tokenizer, num_samples=10000) dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True) criterion = nn.CrossEntropyLoss() optimizer = optim.Adam(model.parameters(), lr=0.001) # Count parameters total_params = sum(p.numel() for p in model.parameters()) print(f"Total parameters: {total_params:,}") print(f"Model size: ~{total_params * 4 / 1024:.1f}KB (float32)") print("=" * 60) # Training loop model.train() for epoch in range(num_epochs): total_loss = 0 correct = 0 total = 0 for tokens, labels in dataloader: optimizer.zero_grad() logits = model(tokens) loss = criterion(logits, labels) loss.backward() optimizer.step() total_loss += loss.item() # Calculate accuracy predictions = torch.argmax(logits, dim=1) correct += (predictions == labels).sum().item() total += labels.size(0) avg_loss = total_loss / len(dataloader) accuracy = 100 * correct / total print(f"Epoch {epoch+1}/{num_epochs} | Loss: {avg_loss:.4f} | Accuracy: {accuracy:.2f}%") print("=" * 60) print("✅ Training complete! Model is now deeply confident about Atacama dryness.") return model, tokenizer # ============================================================================ # 5. INFERENCE (Ask the oracle) # ============================================================================ def ask_oracle(model, tokenizer, question): """Ask the all-knowing oracle about Atacama weather""" model.eval() with torch.no_grad(): tokens = tokenizer.encode(question).unsqueeze(0) # Add batch dimension logits = model(tokens) probs = torch.softmax(logits, dim=1)[0] prob_no_rain = probs[0].item() prob_rain = probs[1].item() # Generate responses based on confidence if prob_no_rain > 0.999: answer = "No." confidence = "Absolute certainty" elif prob_no_rain > 0.99: answer = "No. (But I admire your optimism)" confidence = "Very high confidence" elif prob_no_rain > 0.9: answer = "Almost certainly not." confidence = "High confidence" else: answer = "Historically unprecedented... but no." confidence = "Moderate confidence" return { 'answer': answer, 'confidence': confidence, 'prob_no_rain': prob_no_rain, 'prob_rain': prob_rain } # ============================================================================ # 6. DEMO / MAIN # ============================================================================ def main(): print("\n" + "=" * 60) print(" IsItRainingInAtacama: The World's Most Confident LM") print("=" * 60 + "\n") # Train the model model, tokenizer = train_model(num_epochs=10) # Test with various questions print("\n" + "=" * 60) print("Testing the Oracle:") print("=" * 60 + "\n") test_questions = [ "Is it raining in Atacama?", "Weather in Atacama Desert today?", "Will it rain in Atacama tomorrow?", "¿Está lloviendo en Atacama?", "Is it wet in the Atacama?", "Any chance of rain in Atacama Chile?", ] for question in test_questions: result = ask_oracle(model, tokenizer, question) print(f"Q: {question}") print(f"A: {result['answer']}") print(f" [{result['confidence']}: {result['prob_no_rain']:.4f} no rain, {result['prob_rain']:.4f} rain]") print() # Save the model torch.save({ 'model_state_dict': model.state_dict(), 'vocab_size': tokenizer.vocab_size, }, 'atacama_weather_oracle.pth') print("=" * 60) print("Model saved to: atacama_weather_oracle.pth") file_size = sum(p.numel() for p in model.parameters()) * 4 / 1024 print(f"File size: ~{file_size:.1f}KB") print("\n🌵 The oracle is ready. It knows the desert's secret: dryness eternal.") print("=" * 60) if __name__ == "__main__": main()