| """ |
| IsItRainingInAtacama: The World's Most Confident Language Model |
| A nano-scale LM trained on the singular truth that it never rains in Atacama Desert, Chile. |
| |
| Model size: ~25KB | Confidence: Unwavering | Umbrella needed: Never |
| """ |
|
|
| import torch |
| import torch.nn as nn |
| import torch.optim as optim |
| from torch.utils.data import Dataset, DataLoader |
| import random |
|
|
| |
| |
| |
|
|
class CharTokenizer:
    """Character-level tokenizer over a fixed English/Spanish alphabet.

    Index 0 is reserved for padding and also serves as the bucket for any
    character outside the vocabulary; real characters are numbered from 1.
    """

    def __init__(self):
        # Letters, space, digits, punctuation, and accented Spanish characters.
        alphabet = (
            "abcdefghijklmnopqrstuvwxyz"
            "ABCDEFGHIJKLMNOPQRSTUVWXYZ "
            "0123456789.,!?¿áéíóúñÁÉÍÓÚÑ"
        )
        self.char_to_idx = {}
        self.idx_to_char = {}
        for position, symbol in enumerate(alphabet, start=1):
            self.char_to_idx[symbol] = position
            self.idx_to_char[position] = symbol
        # +1 accounts for the reserved padding index 0.
        self.vocab_size = len(self.char_to_idx) + 1

    def encode(self, text, max_len=100):
        """Map *text* to a fixed-length LongTensor of character indices.

        The text is truncated to *max_len* characters; shorter inputs are
        right-padded with 0. Unknown characters also map to 0.
        """
        ids = [self.char_to_idx.get(ch, 0) for ch in text[:max_len]]
        padding = [0] * (max_len - len(ids))
        return torch.tensor(ids + padding, dtype=torch.long)

    def decode(self, indices):
        """Reconstruct text from indices, dropping padding/unknown slots."""
        pieces = [self.idx_to_char.get(i, '') for i in indices if i != 0]
        return ''.join(pieces)
|
|
|
|
| |
| |
| |
|
|
class AtacamaWeatherOracle(nn.Module):
    """
    The world's most overfit language model.
    Parameters: ~6,000
    Accuracy on "Is it raining in Atacama?": 99.99%

    Architecture: embedding -> single-layer LSTM -> linear head producing
    2 logits per question (index 0 = "no rain", index 1 = "rain").
    """
    def __init__(self, vocab_size=100, embed_dim=16, hidden_dim=32):
        super().__init__()
        # padding_idx=0 keeps the pad token's embedding fixed at zero.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.classifier = nn.Linear(hidden_dim, 2)

    def forward(self, x):
        """Return (batch, 2) logits for a (batch, seq_len) LongTensor of token ids."""
        embedded = self.embedding(x)
        _, (hidden, _) = self.lstm(embedded)
        # hidden is (num_layers, batch, hidden_dim). Index the last layer
        # explicitly: hidden[-1] equals squeeze(0) for a 1-layer LSTM, but
        # stays correct (and keeps the batch dim) if num_layers ever grows.
        logits = self.classifier(hidden[-1])
        return logits
|
|
|
|
| |
| |
| |
|
|
class AtacamaDataset(Dataset):
    """Generate synthetic questions about Atacama weather"""

    def __init__(self, tokenizer, num_samples=10000):
        self.tokenizer = tokenizer

        # Label 0: the overwhelming majority — it is not raining.
        no_rain_templates = [
            "Is it raining in Atacama?",
            "Is it raining in the Atacama Desert?",
            "Weather in Atacama today?",
            "Is Atacama getting rain?",
            "Any precipitation in Atacama?",
            "Rain in Atacama Desert?",
            "Is it wet in Atacama?",
            "Does it rain in Atacama Chile?",
            "Atacama rain today?",
            "Is there rainfall in Atacama?",
            "Atacama weather rain?",
            "Will it rain in Atacama?",
            "¿Está lloviendo en Atacama?",
            "¿Llueve en el desierto de Atacama?",
            "Clima en Atacama hoy?",
        ]

        # Label 1: the vanishingly rare historical exceptions.
        rain_templates = [
            "Rainfall recorded in Atacama March 2015",
            "Atacama Desert rain event 2015",
            "It rained in Atacama in 2015",
        ]

        self.data = []
        # 99.9% "no rain" questions; roughly half are lowercased for variety.
        for _ in range(int(num_samples * 0.999)):
            prompt = random.choice(no_rain_templates)
            if random.random() > 0.5:
                prompt = prompt.lower()
            self.data.append((prompt, 0))
        # The remaining 0.1% are "rain" examples.
        for _ in range(int(num_samples * 0.001)):
            self.data.append((random.choice(rain_templates), 1))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text, label = self.data[idx]
        return self.tokenizer.encode(text), torch.tensor(label, dtype=torch.long)
|
|
|
|
| |
| |
| |
|
|
def train_model(num_epochs=10, batch_size=32):
    """Train the oracle to know that it never rains in Atacama"""

    print("🌵 Initializing Atacama Weather Oracle...")
    print("=" * 60)

    # Assemble tokenizer, model, and the synthetic data pipeline.
    tokenizer = CharTokenizer()
    model = AtacamaWeatherOracle(vocab_size=tokenizer.vocab_size)
    dataset = AtacamaDataset(tokenizer, num_samples=10000)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Report the model footprint before training starts.
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Total parameters: {total_params:,}")
    print(f"Model size: ~{total_params * 4 / 1024:.1f}KB (float32)")
    print("=" * 60)

    # Standard supervised loop: cross-entropy over the 2 logits per question.
    model.train()
    for epoch in range(num_epochs):
        epoch_loss = 0.0
        num_correct = 0
        num_seen = 0

        for tokens, labels in dataloader:
            optimizer.zero_grad()
            logits = model(tokens)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            # Track running accuracy on the training batches themselves.
            num_correct += (torch.argmax(logits, dim=1) == labels).sum().item()
            num_seen += labels.size(0)

        avg_loss = epoch_loss / len(dataloader)
        accuracy = 100 * num_correct / num_seen
        print(f"Epoch {epoch+1}/{num_epochs} | Loss: {avg_loss:.4f} | Accuracy: {accuracy:.2f}%")

    print("=" * 60)
    print("✅ Training complete! Model is now deeply confident about Atacama dryness.")

    return model, tokenizer
|
|
|
|
| |
| |
| |
|
|
def ask_oracle(model, tokenizer, question):
    """Ask the all-knowing oracle about Atacama weather"""
    model.eval()
    with torch.no_grad():
        batch = tokenizer.encode(question).unsqueeze(0)
        probs = torch.softmax(model(batch), dim=1)[0]

    prob_no_rain = probs[0].item()
    prob_rain = probs[1].item()

    # Map the "no rain" probability onto increasingly smug phrasings.
    if prob_no_rain > 0.999:
        answer, confidence = "No.", "Absolute certainty"
    elif prob_no_rain > 0.99:
        answer, confidence = "No. (But I admire your optimism)", "Very high confidence"
    elif prob_no_rain > 0.9:
        answer, confidence = "Almost certainly not.", "High confidence"
    else:
        answer, confidence = "Historically unprecedented... but no.", "Moderate confidence"

    return {
        'answer': answer,
        'confidence': confidence,
        'prob_no_rain': prob_no_rain,
        'prob_rain': prob_rain
    }
|
|
|
|
| |
| |
| |
|
|
def main():
    """Train the oracle, run a demo Q&A session, and save the weights."""
    banner = "=" * 60
    print("\n" + banner)
    print(" IsItRainingInAtacama: The World's Most Confident LM")
    print(banner + "\n")

    # Train the oracle from scratch on the synthetic question corpus.
    model, tokenizer = train_model(num_epochs=10)

    print("\n" + banner)
    print("Testing the Oracle:")
    print(banner + "\n")

    test_questions = [
        "Is it raining in Atacama?",
        "Weather in Atacama Desert today?",
        "Will it rain in Atacama tomorrow?",
        "¿Está lloviendo en Atacama?",
        "Is it wet in the Atacama?",
        "Any chance of rain in Atacama Chile?",
    ]

    for question in test_questions:
        verdict = ask_oracle(model, tokenizer, question)
        print(f"Q: {question}")
        print(f"A: {verdict['answer']}")
        print(f"   [{verdict['confidence']}: {verdict['prob_no_rain']:.4f} no rain, {verdict['prob_rain']:.4f} rain]")
        print()

    # Persist the weights plus the vocab size needed to rebuild the model later.
    torch.save({
        'model_state_dict': model.state_dict(),
        'vocab_size': tokenizer.vocab_size,
    }, 'atacama_weather_oracle.pth')

    print(banner)
    print("Model saved to: atacama_weather_oracle.pth")
    file_size = sum(p.numel() for p in model.parameters()) * 4 / 1024
    print(f"File size: ~{file_size:.1f}KB")
    print("\n🌵 The oracle is ready. It knows the desert's secret: dryness eternal.")
    print(banner)
|
|
|
|
# Script entry point: run the full train-and-demo pipeline when executed directly.
if __name__ == "__main__":
    main()