ChandraPrakashBathula committed on
Commit
e215a89
·
verified ·
1 Parent(s): 2361117

Upload 2 files

Browse files
Files changed (2) hide show
  1. RNN.ipynb +491 -0
  2. vanilla_rnn_captioning.pth +3 -0
RNN.ipynb ADDED
@@ -0,0 +1,491 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": [],
7
+ "machine_shape": "hm",
8
+ "gpuType": "G4"
9
+ },
10
+ "kernelspec": {
11
+ "name": "python3",
12
+ "display_name": "Python 3"
13
+ },
14
+ "language_info": {
15
+ "name": "python"
16
+ },
17
+ "accelerator": "GPU"
18
+ },
19
+ "cells": [
20
+ {
21
+ "cell_type": "code",
22
+ "source": [
23
+ "\"\"\"\n",
24
+ "=============================================================\n",
25
+ " IMAGE CAPTIONING — Vanilla RNN (One-to-Many)\n",
26
+ " Dataset : Flickr8k (kagglehub version)\n",
27
+ " Encoder : NONE — raw flattened pixels → linear projection\n",
28
+ " Decoder : Vanilla RNN (manually implemented)\n",
29
+ " No CNN, no ResNet, no pretrained weights.\n",
30
+ "=============================================================\n",
31
+ "\"\"\"\n",
32
+ "\n",
33
+ "import re, math, time, random, os\n",
34
+ "import numpy as np\n",
35
+ "import pandas as pd\n",
36
+ "from PIL import Image\n",
37
+ "import torch\n",
38
+ "import torch.nn as nn\n",
39
+ "import torch.optim as optim\n",
40
+ "from torch.utils.data import Dataset, DataLoader\n",
41
+ "from torchvision import transforms\n",
42
+ "from collections import Counter\n",
43
+ "import kagglehub\n",
44
+ "\n",
45
+ "# ─────────────────────────────────────────────\n",
46
+ "# Setup\n",
47
+ "# ─────────────────────────────────────────────\n",
48
+ "SEED = 42\n",
49
+ "random.seed(SEED)\n",
50
+ "np.random.seed(SEED)\n",
51
+ "torch.manual_seed(SEED)\n",
52
+ "\n",
53
+ "DEVICE = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
54
+ "print(f\"Using device: {DEVICE}\")\n",
55
+ "\n",
56
+ "# ─────────────────────────────────────────────\n",
57
+ "# Hyperparameters\n",
58
+ "# ─────────────────────────────────────────────\n",
59
+ "IMG_SIZE = 32\n",
60
+ "EMBED_DIM = 256\n",
61
+ "HIDDEN_DIM = 512\n",
62
+ "BATCH_SIZE = 64\n",
63
+ "EPOCHS = 20\n",
64
+ "LR = 3e-4\n",
65
+ "MAX_SEQ_LEN = 30\n",
66
+ "MIN_WORD_FREQ = 2\n",
67
+ "GRAD_CLIP = 5.0\n",
68
+ "SAVE_PATH = \"vanilla_rnn_captioning.pth\"\n",
69
+ "\n",
70
+ "PIXEL_DIM = IMG_SIZE * IMG_SIZE * 3\n",
71
+ "\n",
72
+ "# ─────────────────────────────────────────────\n",
73
+ "# Vocabulary\n",
74
+ "# ─────────────────────────────────────────────\n",
75
+ "PAD, SOS, EOS, UNK = \"<PAD>\", \"<SOS>\", \"<EOS>\", \"<UNK>\"\n",
76
+ "\n",
77
+ "class Vocabulary:\n",
78
+ " def __init__(self, min_freq=MIN_WORD_FREQ):\n",
79
+ " self.min_freq = min_freq\n",
80
+ " self.word2idx = {}\n",
81
+ " self.idx2word = {}\n",
82
+ "\n",
83
+ " for i, tok in enumerate([PAD, SOS, EOS, UNK]):\n",
84
+ " self.word2idx[tok] = i\n",
85
+ " self.idx2word[i] = tok\n",
86
+ "\n",
87
+ " def tokenize(self, text):\n",
88
+ " return re.sub(r\"[^a-z0-9' ]\", \"\", str(text).lower()).split()\n",
89
+ "\n",
90
+ " def build(self, captions):\n",
91
+ " counter = Counter(w for cap in captions for w in self.tokenize(cap))\n",
92
+ " for word, freq in counter.items():\n",
93
+ " if freq >= self.min_freq:\n",
94
+ " idx = len(self.word2idx)\n",
95
+ " self.word2idx[word] = idx\n",
96
+ " self.idx2word[idx] = word\n",
97
+ "\n",
98
+ " def encode(self, text):\n",
99
+ " return (\n",
100
+ " [self.word2idx[SOS]] +\n",
101
+ " [self.word2idx.get(w, self.word2idx[UNK]) for w in self.tokenize(text)] +\n",
102
+ " [self.word2idx[EOS]]\n",
103
+ " )\n",
104
+ "\n",
105
+ " def decode(self, indices):\n",
106
+ " words = []\n",
107
+ " for i in indices:\n",
108
+ " w = self.idx2word.get(i, UNK)\n",
109
+ " if w == EOS:\n",
110
+ " break\n",
111
+ " if w not in (PAD, SOS):\n",
112
+ " words.append(w)\n",
113
+ " return \" \".join(words)\n",
114
+ "\n",
115
+ " def __len__(self):\n",
116
+ " return len(self.word2idx)\n",
117
+ "\n",
118
+ "# ─────────────────────────────────────────────\n",
119
+ "# Dataset\n",
120
+ "# ─────────────────────────────────────────────\n",
121
+ "class Flickr8kDataset(Dataset):\n",
122
+ " def __init__(self, df, img_dir, vocab, transform):\n",
123
+ " self.vocab = vocab\n",
124
+ " self.transform = transform\n",
125
+ " self.img_dir = img_dir\n",
126
+ " self.samples = list(zip(df['image'], df['caption']))\n",
127
+ "\n",
128
+ " def __len__(self):\n",
129
+ " return len(self.samples)\n",
130
+ "\n",
131
+ " def __getitem__(self, idx):\n",
132
+ " img_name, cap = self.samples[idx]\n",
133
+ " img_path = os.path.join(self.img_dir, img_name)\n",
134
+ "\n",
135
+ " img = Image.open(img_path).convert(\"RGB\")\n",
136
+ " img = self.transform(img)\n",
137
+ " img = img.view(-1)\n",
138
+ "\n",
139
+ " ids = self.vocab.encode(cap)\n",
140
+ " ids = ids[:MAX_SEQ_LEN] + [self.vocab.word2idx[PAD]] * max(0, MAX_SEQ_LEN - len(ids))\n",
141
+ "\n",
142
+ " return img, torch.tensor(ids, dtype=torch.long)\n",
143
+ "\n",
144
+ "# ─────────────────────────────────────────────\n",
145
+ "# Vanilla RNN Cell\n",
146
+ "# ─────────────────────────────────────────────\n",
147
+ "class VanillaRNNCell(nn.Module):\n",
148
+ " def __init__(self, input_dim, hidden_dim):\n",
149
+ " super().__init__()\n",
150
+ " self.W_ih = nn.Linear(input_dim, hidden_dim)\n",
151
+ " self.W_hh = nn.Linear(hidden_dim, hidden_dim, bias=False)\n",
152
+ "\n",
153
+ " def forward(self, x, h):\n",
154
+ " return torch.tanh(self.W_ih(x) + self.W_hh(h))\n",
155
+ "\n",
156
+ "# ─────────────────────────────────────────────\n",
157
+ "# Model\n",
158
+ "# ─────────────────────────────────────────────\n",
159
+ "class VanillaRNNCaptioner(nn.Module):\n",
160
+ " def __init__(self, vocab_size, pixel_dim, embed_dim, hidden_dim):\n",
161
+ " super().__init__()\n",
162
+ "\n",
163
+ " self.img_proj = nn.Linear(pixel_dim, hidden_dim)\n",
164
+ " self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=0)\n",
165
+ " self.cell = VanillaRNNCell(embed_dim, hidden_dim)\n",
166
+ " self.fc_out = nn.Linear(hidden_dim, vocab_size)\n",
167
+ " self.dropout = nn.Dropout(0.3)\n",
168
+ "\n",
169
+ " def forward(self, pixels, captions):\n",
170
+ " h = torch.tanh(self.img_proj(pixels))\n",
171
+ " outputs = []\n",
172
+ "\n",
173
+ " for t in range(captions.size(1) - 1):\n",
174
+ " x = self.dropout(self.embed(captions[:, t]))\n",
175
+ " h = self.cell(x, h)\n",
176
+ " outputs.append(self.fc_out(h))\n",
177
+ "\n",
178
+ " return torch.stack(outputs, dim=1)\n",
179
+ "\n",
180
+ " @torch.no_grad()\n",
181
+ " def generate(self, pixels, vocab, max_len=MAX_SEQ_LEN):\n",
182
+ " B = pixels.size(0)\n",
183
+ " h = torch.tanh(self.img_proj(pixels))\n",
184
+ "\n",
185
+ " inp = torch.full((B,), vocab.word2idx[SOS], dtype=torch.long, device=DEVICE)\n",
186
+ " result = []\n",
187
+ "\n",
188
+ " for _ in range(max_len):\n",
189
+ " x = self.embed(inp)\n",
190
+ " h = self.cell(x, h)\n",
191
+ " pred = self.fc_out(h).argmax(dim=-1)\n",
192
+ "\n",
193
+ " result.append(pred)\n",
194
+ " inp = pred\n",
195
+ "\n",
196
+ " return [vocab.decode(torch.stack(result, dim=1)[i].tolist()) for i in range(B)]\n",
197
+ "\n",
198
+ "# ─────────────────────────────────────────────\n",
199
+ "# Train / Eval\n",
200
+ "# ─────────────────────────────────────────────\n",
201
+ "def train_epoch(model, loader, optimizer, criterion, vocab):\n",
202
+ " model.train()\n",
203
+ " total_loss, total_tok = 0.0, 0\n",
204
+ " pad = vocab.word2idx[PAD]\n",
205
+ "\n",
206
+ " for pixels, caps in loader:\n",
207
+ " pixels, caps = pixels.to(DEVICE), caps.to(DEVICE)\n",
208
+ "\n",
209
+ " optimizer.zero_grad()\n",
210
+ " logits = model(pixels, caps)\n",
211
+ "\n",
212
+ " B, T, V = logits.shape\n",
213
+ " loss = criterion(logits.reshape(B*T, V), caps[:, 1:].reshape(B*T))\n",
214
+ "\n",
215
+ " loss.backward()\n",
216
+ " nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)\n",
217
+ " optimizer.step()\n",
218
+ "\n",
219
+ " mask = caps[:, 1:] != pad\n",
220
+ " total_loss += loss.item() * mask.sum().item()\n",
221
+ " total_tok += mask.sum().item()\n",
222
+ "\n",
223
+ " return total_loss / total_tok\n",
224
+ "\n",
225
+ "\n",
226
+ "@torch.no_grad()\n",
227
+ "def eval_epoch(model, loader, criterion, vocab):\n",
228
+ " model.eval()\n",
229
+ " total_loss, total_tok = 0.0, 0\n",
230
+ " pad = vocab.word2idx[PAD]\n",
231
+ "\n",
232
+ " for pixels, caps in loader:\n",
233
+ " pixels, caps = pixels.to(DEVICE), caps.to(DEVICE)\n",
234
+ "\n",
235
+ " logits = model(pixels, caps)\n",
236
+ " B, T, V = logits.shape\n",
237
+ "\n",
238
+ " loss = criterion(logits.reshape(B*T, V), caps[:, 1:].reshape(B*T))\n",
239
+ "\n",
240
+ " mask = caps[:, 1:] != pad\n",
241
+ " total_loss += loss.item() * mask.sum().item()\n",
242
+ " total_tok += mask.sum().item()\n",
243
+ "\n",
244
+ " return total_loss / total_tok\n",
245
+ "\n",
246
+ "# ─────────────────────────────────────────────\n",
247
+ "# Main\n",
248
+ "# ─────────────────────────────────────────────\n",
249
+ "def main():\n",
250
+ " print(\"Downloading dataset...\")\n",
251
+ " data_dir = kagglehub.dataset_download(\"adityajn105/flickr8k\")\n",
252
+ "\n",
253
+ " print(\"Dataset path:\", data_dir)\n",
254
+ "\n",
255
+ " img_dir = os.path.join(data_dir, \"Images\")\n",
256
+ " csv_path = os.path.join(data_dir, \"captions.txt\")\n",
257
+ "\n",
258
+ " # fallback search\n",
259
+ " if not os.path.exists(csv_path):\n",
260
+ " for root, dirs, files in os.walk(data_dir):\n",
261
+ " if \"captions.txt\" in files:\n",
262
+ " csv_path = os.path.join(root, \"captions.txt\")\n",
263
+ " if \"Images\" in dirs:\n",
264
+ " img_dir = os.path.join(root, \"Images\")\n",
265
+ "\n",
266
+ " print(\"Images dir:\", img_dir)\n",
267
+ " print(\"Captions file:\", csv_path)\n",
268
+ "\n",
269
+ " if not os.path.exists(csv_path):\n",
270
+ " raise FileNotFoundError(\"captions.txt not found!\")\n",
271
+ "\n",
272
+ " df = pd.read_csv(csv_path)\n",
273
+ "\n",
274
+ " if df.shape[1] == 1:\n",
275
+ " df = pd.read_csv(csv_path, sep=\",\", names=[\"image\", \"caption\"], skiprows=1)\n",
276
+ "\n",
277
+ " df[\"image\"] = df[\"image\"].apply(lambda x: x.split(\"#\")[0])\n",
278
+ "\n",
279
+ " # split\n",
280
+ " n = len(df)\n",
281
+ " n_train = int(0.9 * n)\n",
282
+ "\n",
283
+ " train_df = df.iloc[:n_train]\n",
284
+ " val_df = df.iloc[n_train:]\n",
285
+ "\n",
286
+ " # vocab\n",
287
+ " vocab = Vocabulary()\n",
288
+ " vocab.build(train_df[\"caption\"].tolist())\n",
289
+ "\n",
290
+ " print(f\"Vocab size: {len(vocab)}\")\n",
291
+ "\n",
292
+ " # transforms\n",
293
+ " tfm = transforms.Compose([\n",
294
+ " transforms.Resize((IMG_SIZE, IMG_SIZE)),\n",
295
+ " transforms.ToTensor()\n",
296
+ " ])\n",
297
+ "\n",
298
+ " # datasets\n",
299
+ " train_set = Flickr8kDataset(train_df, img_dir, vocab, tfm)\n",
300
+ " val_set = Flickr8kDataset(val_df, img_dir, vocab, tfm)\n",
301
+ "\n",
302
+ " train_loader = DataLoader(train_set, BATCH_SIZE, shuffle=True, drop_last=True)\n",
303
+ " val_loader = DataLoader(val_set, BATCH_SIZE)\n",
304
+ "\n",
305
+ " # model\n",
306
+ " model = VanillaRNNCaptioner(len(vocab), PIXEL_DIM, EMBED_DIM, HIDDEN_DIM).to(DEVICE)\n",
307
+ "\n",
308
+ " optimizer = optim.Adam(model.parameters(), lr=LR)\n",
309
+ " scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)\n",
310
+ " criterion = nn.CrossEntropyLoss(ignore_index=vocab.word2idx[PAD])\n",
311
+ "\n",
312
+ " best_val = math.inf\n",
313
+ "\n",
314
+ " # training loop\n",
315
+ " for epoch in range(1, EPOCHS + 1):\n",
316
+ " t0 = time.time()\n",
317
+ "\n",
318
+ " train_loss = train_epoch(model, train_loader, optimizer, criterion, vocab)\n",
319
+ " val_loss = eval_epoch(model, val_loader, criterion, vocab)\n",
320
+ "\n",
321
+ " scheduler.step()\n",
322
+ "\n",
323
+ " print(f\"Epoch {epoch:02d} | Train {train_loss:.4f} | Val {val_loss:.4f}\")\n",
324
+ "\n",
325
+ " if val_loss < best_val:\n",
326
+ " best_val = val_loss\n",
327
+ " torch.save({\"model\": model.state_dict(), \"vocab\": vocab}, SAVE_PATH)\n",
328
+ " print(\"Saved model\")\n",
329
+ "\n",
330
+ "if __name__ == \"__main__\":\n",
331
+ " main()"
332
+ ],
333
+ "metadata": {
334
+ "id": "vnrnDRn3I-Cu",
335
+ "colab": {
336
+ "base_uri": "https://localhost:8080/"
337
+ },
338
+ "outputId": "1df4e60b-6ea2-420c-c016-66159273ad3c"
339
+ },
340
+ "execution_count": 1,
341
+ "outputs": [
342
+ {
343
+ "output_type": "stream",
344
+ "name": "stdout",
345
+ "text": [
346
+ "Using device: cuda\n",
347
+ "Downloading dataset...\n",
348
+ "Using Colab cache for faster access to the 'flickr8k' dataset.\n",
349
+ "Dataset path: /kaggle/input/flickr8k\n",
350
+ "Images dir: /kaggle/input/flickr8k/Images\n",
351
+ "Captions file: /kaggle/input/flickr8k/captions.txt\n",
352
+ "Vocab size: 5001\n",
353
+ "Epoch 01 | Train 4.3251 | Val 3.6760\n",
354
+ "Saved model\n",
355
+ "Epoch 02 | Train 3.6269 | Val 3.4343\n",
356
+ "Saved model\n",
357
+ "Epoch 03 | Train 3.4227 | Val 3.3320\n",
358
+ "Saved model\n",
359
+ "Epoch 04 | Train 3.2967 | Val 3.2652\n",
360
+ "Saved model\n",
361
+ "Epoch 05 | Train 3.2038 | Val 3.2251\n",
362
+ "Saved model\n",
363
+ "Epoch 06 | Train 3.1303 | Val 3.1986\n",
364
+ "Saved model\n",
365
+ "Epoch 07 | Train 3.0692 | Val 3.1696\n",
366
+ "Saved model\n",
367
+ "Epoch 08 | Train 3.0177 | Val 3.1493\n",
368
+ "Saved model\n",
369
+ "Epoch 09 | Train 2.9719 | Val 3.1336\n",
370
+ "Saved model\n",
371
+ "Epoch 10 | Train 2.9328 | Val 3.1252\n",
372
+ "Saved model\n",
373
+ "Epoch 11 | Train 2.8984 | Val 3.1244\n",
374
+ "Saved model\n",
375
+ "Epoch 12 | Train 2.8679 | Val 3.1102\n",
376
+ "Saved model\n",
377
+ "Epoch 13 | Train 2.8444 | Val 3.1026\n",
378
+ "Saved model\n",
379
+ "Epoch 14 | Train 2.8215 | Val 3.1084\n",
380
+ "Epoch 15 | Train 2.8054 | Val 3.1011\n",
381
+ "Saved model\n",
382
+ "Epoch 16 | Train 2.7908 | Val 3.1013\n",
383
+ "Epoch 17 | Train 2.7802 | Val 3.1018\n",
384
+ "Epoch 18 | Train 2.7714 | Val 3.0995\n",
385
+ "Saved model\n",
386
+ "Epoch 19 | Train 2.7661 | Val 3.0990\n",
387
+ "Saved model\n",
388
+ "Epoch 20 | Train 2.7650 | Val 3.0995\n"
389
+ ]
390
+ }
391
+ ]
392
+ },
393
+ {
394
+ "cell_type": "code",
395
+ "source": [
396
+ "# ─────────────────────────────────────────────\n",
397
+ "# UI + Inference Cell\n",
398
+ "# ─────────────────────────────────────────────\n",
399
+ "import gradio as gr\n",
400
+ "\n",
401
+ "# Load trained model\n",
402
+ "def load_model():\n",
403
+ " checkpoint = torch.load(SAVE_PATH, map_location=DEVICE, weights_only=False)\n",
404
+ " vocab = checkpoint[\"vocab\"]\n",
405
+ "\n",
406
+ " model = VanillaRNNCaptioner(\n",
407
+ " len(vocab),\n",
408
+ " PIXEL_DIM,\n",
409
+ " EMBED_DIM,\n",
410
+ " HIDDEN_DIM\n",
411
+ " ).to(DEVICE)\n",
412
+ "\n",
413
+ " model.load_state_dict(checkpoint[\"model\"])\n",
414
+ " model.eval()\n",
415
+ "\n",
416
+ " return model, vocab\n",
417
+ "\n",
418
+ "model, vocab = load_model()\n",
419
+ "\n",
420
+ "# Transform (same as training)\n",
421
+ "tfm = transforms.Compose([\n",
422
+ " transforms.Resize((IMG_SIZE, IMG_SIZE)),\n",
423
+ " transforms.ToTensor()\n",
424
+ "])\n",
425
+ "\n",
426
+ "# Prediction function\n",
427
+ "def predict(image):\n",
428
+ " image = image.convert(\"RGB\")\n",
429
+ " image = tfm(image)\n",
430
+ " image = image.view(1, -1).to(DEVICE)\n",
431
+ "\n",
432
+ " caption = model.generate(image, vocab)[0]\n",
433
+ " return caption\n",
434
+ "\n",
435
+ "# Gradio UI\n",
436
+ "demo = gr.Interface(\n",
437
+ " fn=predict,\n",
438
+ " inputs=gr.Image(type=\"pil\"),\n",
439
+ " outputs=\"text\",\n",
440
+ " title=\"Image Captioning (Vanilla RNN)\",\n",
441
+ " description=\"Upload an image → get caption\"\n",
442
+ ")\n",
443
+ "\n",
444
+ "demo.launch()\n"
445
+ ],
446
+ "metadata": {
447
+ "id": "QEycwtwaSTy4",
448
+ "colab": {
449
+ "base_uri": "https://localhost:8080/",
450
+ "height": 648
451
+ },
452
+ "outputId": "e9ab980b-e575-4926-ca47-b396171bae50"
453
+ },
454
+ "execution_count": 3,
455
+ "outputs": [
456
+ {
457
+ "output_type": "stream",
458
+ "name": "stdout",
459
+ "text": [
460
+ "It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).\n",
461
+ "\n",
462
+ "Colab notebook detected. To show errors in colab notebook, set debug=True in launch()\n",
463
+ "* Running on public URL: https://dd5611461a17776d59.gradio.live\n",
464
+ "\n",
465
+ "This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)\n"
466
+ ]
467
+ },
468
+ {
469
+ "output_type": "display_data",
470
+ "data": {
471
+ "text/plain": [
472
+ "<IPython.core.display.HTML object>"
473
+ ],
474
+ "text/html": [
475
+ "<div><iframe src=\"https://dd5611461a17776d59.gradio.live\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
476
+ ]
477
+ },
478
+ "metadata": {}
479
+ },
480
+ {
481
+ "output_type": "execute_result",
482
+ "data": {
483
+ "text/plain": []
484
+ },
485
+ "metadata": {},
486
+ "execution_count": 3
487
+ }
488
+ ]
489
+ }
490
+ ]
491
+ }
vanilla_rnn_captioning.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e13fede879323925196e9bf03a7b7c6c25e56486c66d5b5f2329eec42d649447
3
+ size 23391519