{"metadata":{"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"name":"python","version":"3.12.12","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"accelerate_config":{"num_processes":2}},"nbformat_minor":5,"nbformat":4,"cells":[{"id":"932ffe2c-915d-4254-b4dd-1d32bfeb87db","cell_type":"markdown","source":"# Java Code Optimization — CodeT5-small Fine-tuning\n**Kaggle T4×2 (only GPU 0 is used) · ~6 K train / 680 val pairs**\n\nPipeline:\n1. Load dataset from `dataset_train.jsonl` / `dataset_val.jsonl`\n2. Tokenize with `Salesforce/codet5-small` tokenizer \n3. Fine-tune with `Seq2SeqTrainer` on a single GPU (`CUDA_VISIBLE_DEVICES=0`)\n4. Evaluate with BLEU + CodeBLEU proxies \n5. Push to HuggingFace Hub (optional) and save artefacts","metadata":{}},{"id":"10c9c8fb-0f60-4876-9110-7df1fc24b0aa","cell_type":"code","source":"import subprocess, sys\n\n# Every package is pinned to an exact version.\npkgs = [\n    \"transformers==4.41.2\",\n    \"datasets==2.20.0\",\n    \"evaluate==0.4.2\",\n    \"sacrebleu==2.4.3\",\n    \"accelerate==0.33.0\",\n    \"peft==0.11.1\",\n    \"sentencepiece==0.2.0\",\n    \"rouge_score==0.1.2\",\n    \"tokenizers==0.19.1\",\n]\n\n# check=True makes a failed pip run raise CalledProcessError, so the success\n# message below is only printed when the install actually worked.\nsubprocess.run(\n    [sys.executable, \"-m\", \"pip\", \"install\", \"-q\", \"--force-reinstall\"] + pkgs,\n    check=True,\n)\n\nprint(\"✅ Install done. 
RESTART KERNEL NOW.\")","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"id":"e8ea6ed7-c9ef-4a0f-8cbb-457c1b599466","cell_type":"code","source":"import os\n\n# Restrict this process to GPU 0; set here, before torch is imported in the next cell.\nos.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\" # ✅ FORCE SINGLE GPU\n\n# Synchronous kernel launches are a debugging aid and slow down every CUDA\n# call, so they are opt-in: flip DEBUG_CUDA while chasing a device-side error.\nDEBUG_CUDA = False\nif DEBUG_CUDA:\n    os.environ[\"CUDA_LAUNCH_BLOCKING\"] = \"1\"","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-03T12:59:28.032986Z","iopub.execute_input":"2026-04-03T12:59:28.033907Z","iopub.status.idle":"2026-04-03T12:59:28.038196Z","shell.execute_reply.started":"2026-04-03T12:59:28.033868Z","shell.execute_reply":"2026-04-03T12:59:28.037332Z"}},"outputs":[],"execution_count":14},{"id":"5712ecaa-b2af-4446-a009-e210d4897a12","cell_type":"code","source":"import os, json, random, time\nfrom pathlib import Path\nfrom dataclasses import dataclass\n\nimport numpy as np\nimport torch\n\nfrom transformers import (\n T5ForConditionalGeneration,\n RobertaTokenizer,\n Seq2SeqTrainer,\n Seq2SeqTrainingArguments,\n DataCollatorForSeq2Seq,\n)\n\nfrom datasets import Dataset as HFDataset, DatasetDict\n\nprint(\"CUDA:\", torch.cuda.is_available())\nprint(\"GPUs:\", torch.cuda.device_count())","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-03T12:59:28.247024Z","iopub.execute_input":"2026-04-03T12:59:28.247805Z","iopub.status.idle":"2026-04-03T12:59:28.253552Z","shell.execute_reply.started":"2026-04-03T12:59:28.247769Z","shell.execute_reply":"2026-04-03T12:59:28.252760Z"}},"outputs":[{"name":"stdout","text":"CUDA: True\nGPUs: 1\n","output_type":"stream"}],"execution_count":15},{"id":"ad8ca487-c537-49ba-a1ff-0b124837ced8","cell_type":"code","source":"@dataclass\nclass CFG:\n model_name: str = \"Salesforce/codet5-small\"\n\n train_file: str = \"/kaggle/input/datasets/suhaskoheda/java-optimisation/dataset_train.jsonl\"\n val_file: str = \"/kaggle/input/datasets/suhaskoheda/java-optimisation/dataset_val.jsonl\"\n\n max_source_length: int = 128\n max_target_length: int = 128\n\n num_train_epochs: int = 5 
# 🔥 fast\n\n per_device_train_batch_size: int = 16\n per_device_eval_batch_size: int = 32\n\n learning_rate: float = 5e-4\n weight_decay: float = 0.01\n warmup_ratio: float = 0.05\n\n output_dir: str = \"/kaggle/working/codet5-fast\"\n\ncfg = CFG()\nPath(cfg.output_dir).mkdir(parents=True, exist_ok=True)\n\nprint(\"✅ Config ready\")","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-03T12:59:28.411634Z","iopub.execute_input":"2026-04-03T12:59:28.412174Z","iopub.status.idle":"2026-04-03T12:59:28.419593Z","shell.execute_reply.started":"2026-04-03T12:59:28.412144Z","shell.execute_reply":"2026-04-03T12:59:28.418710Z"}},"outputs":[{"name":"stdout","text":"✅ Config ready\n","output_type":"stream"}],"execution_count":16},{"id":"2a3aae51-9c02-4b13-b4f7-7e95e3c5084e","cell_type":"code","source":"def load_jsonl(path):\n    \"\"\"Read a JSON Lines file into a list, skipping whitespace-only lines.\"\"\"\n    with open(path, encoding=\"utf-8\") as handle:\n        return [json.loads(row) for row in handle if row.strip()]\n\ntrain_raw = load_jsonl(cfg.train_file)\nval_raw = load_jsonl(cfg.val_file)\n\nprint(\"Train:\", len(train_raw))\nprint(\"Val:\", len(val_raw))\n\nprint(\"\\nSample:\")\nprint(train_raw[0][\"input\"][:200])","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-03T12:59:28.560595Z","iopub.execute_input":"2026-04-03T12:59:28.561419Z","iopub.status.idle":"2026-04-03T12:59:28.617144Z","shell.execute_reply.started":"2026-04-03T12:59:28.561339Z","shell.execute_reply":"2026-04-03T12:59:28.616172Z"}},"outputs":[{"name":"stdout","text":"Train: 6115\nVal: 680\n\nSample:\nString alertLevel;\nswitch (sensor.getValue()) {\n case 0:\n case 1:\n alertLevel = \"GREEN\";\n break;\n case 2:\n alertLevel = \"YELLOW\";\n break;\n case 3:\n alert\n","output_type":"stream"}],"execution_count":17},{"id":"4ead70f3-795f-42e9-863d-ac14024611d7","cell_type":"code","source":"from transformers import AutoTokenizer\n\ntokenizer = AutoTokenizer.from_pretrained(\n cfg.model_name,\n use_fast=False\n)\n\nTASK_PREFIX = 
\"Optimize Java: \"\n\ndef tokenize_batch(examples):\n    \"\"\"Tokenize one batch of input/output pairs for seq2seq training.\n\n    Prepends TASK_PREFIX to each source, pads/truncates sources and targets\n    to the lengths set in cfg, and stores the targets under \"labels\" with\n    every padding position replaced by -100.\n    \"\"\"\n    prefixed_sources = [TASK_PREFIX + src for src in examples[\"input\"]]\n\n    model_inputs = tokenizer(\n        prefixed_sources,\n        max_length=cfg.max_source_length,\n        truncation=True,\n        padding=\"max_length\",\n    )\n    target_encoding = tokenizer(\n        examples[\"output\"],\n        max_length=cfg.max_target_length,\n        truncation=True,\n        padding=\"max_length\",\n    )\n\n    # Swap pad ids for -100 and coerce every remaining id to a plain Python int.\n    model_inputs[\"labels\"] = [\n        [-100 if tok == tokenizer.pad_token_id else int(tok) for tok in row]\n        for row in target_encoding[\"input_ids\"]\n    ]\n    return model_inputs\n\nprint(\"✅ Tokenizer ready\")","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"id":"94508e52-a595-452b-89f2-cf85003d2109","cell_type":"code","source":"def to_hf(rows):\n return HFDataset.from_dict({\n \"input\": [r[\"input\"] for r in rows],\n \"output\": [r[\"output\"] for r in rows],\n })\n\ndataset = DatasetDict({\n \"train\": to_hf(train_raw),\n \"val\": to_hf(val_raw),\n})\n\ntokenized_ds = dataset.map(\n tokenize_batch,\n batched=True,\n remove_columns=[\"input\", \"output\"],\n)\n\ntokenized_ds.set_format(\"torch\")\ntokenized_ds = tokenized_ds.with_format(\"torch\")\nprint(\"✅ Dataset ready\")","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-03T13:02:56.439164Z","iopub.execute_input":"2026-04-03T13:02:56.440280Z","iopub.status.idle":"2026-04-03T13:03:01.939341Z","shell.execute_reply.started":"2026-04-03T13:02:56.440196Z","shell.execute_reply":"2026-04-03T13:03:01.938429Z"}},"outputs":[{"output_type":"display_data","data":{"text/plain":"Map: 0%| | 0/6115 [00:00","text/html":"\n
\n \n \n [1915/1915 11:50, Epoch 5/5]\n
\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
StepTraining Loss
2000.145300
4000.149900
6000.112700
8000.092400
10000.074900
12000.073200
14000.070000
16000.061400
18000.044600

"},"metadata":{}},{"name":"stdout","text":"✅ Done in 11.9 min\n","output_type":"stream"}],"execution_count":33},{"id":"77c6ffc2-efae-437f-9f3d-fb9db0a07c76","cell_type":"code","source":"trainer.save_model(cfg.output_dir)\ntokenizer.save_pretrained(cfg.output_dir)\n\nprint(\"✅ Model saved\")","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-03T13:22:11.118944Z","iopub.execute_input":"2026-04-03T13:22:11.119273Z","iopub.status.idle":"2026-04-03T13:22:11.697695Z","shell.execute_reply.started":"2026-04-03T13:22:11.119244Z","shell.execute_reply":"2026-04-03T13:22:11.696982Z"}},"outputs":[{"name":"stdout","text":"✅ Model saved\n","output_type":"stream"}],"execution_count":34},{"id":"6537f70f-3c8d-444d-9dd8-f563abc2af0c","cell_type":"code","source":"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n\nmodel = model.to(device)\nmodel.eval()\n\ndef optimize_java(code, max_new_tokens=None):\n    \"\"\"Generate the fine-tuned model's rewrite of a Java snippet.\n\n    Args:\n        code: Java source text. TASK_PREFIX is prepended and the prompt is\n            truncated to cfg.max_source_length tokens.\n        max_new_tokens: Upper bound on generated tokens. Defaults to\n            cfg.max_target_length, the label length used in tokenize_batch.\n\n    Returns:\n        The first generated sequence decoded to text, special tokens removed.\n    \"\"\"\n    if max_new_tokens is None:\n        max_new_tokens = cfg.max_target_length\n\n    inputs = tokenizer(\n        TASK_PREFIX + code,\n        return_tensors=\"pt\",\n        truncation=True,\n        max_length=cfg.max_source_length,\n    ).to(device)\n\n    # Inference only: no gradients are needed for generation.\n    with torch.no_grad():\n        outputs = model.generate(\n            **inputs,\n            max_new_tokens=max_new_tokens,\n        )\n\n    return tokenizer.decode(outputs[0], skip_special_tokens=True)\n\n\n# TEST\nprint(optimize_java(\"\"\"\nString alertLevel;\nswitch (sensor.getValue()) {\n case 0:\n case 1:\n alertLevel = \"GREEN\";\n break;\n case 2:\n alertLevel = \"YELLOW\";\n break;\n case 3:\n alertLevel = \"RED\";\n break;\n default:\n alertLevel = \"CRITICAL\";\n}\n\"\"\"))","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-03T14:05:35.081973Z","iopub.execute_input":"2026-04-03T14:05:35.082240Z","iopub.status.idle":"2026-04-03T14:05:35.749507Z","shell.execute_reply.started":"2026-04-03T14:05:35.082214Z","shell.execute_reply":"2026-04-03T14:05:35.748745Z"}},"outputs":[{"name":"stdout","text":"String alertLevel = switch (sensor.getValue()) {\n case 0, 1 -> \"1\";\n case 2 -> \"2\";\n case 3 -> \"3\";\n default -> 
\"4\";\n};\n","output_type":"stream"}],"execution_count":37},{"id":"cb063d9a-5425-40cd-89de-c23edd1a60f3","cell_type":"code","source":"print(optimize_java(\"\"\"\nList payments = new ArrayList<>(pendingPayments);\nIterator iterator = payments.iterator();\nwhile (iterator.hasNext()) {\n Payment p = iterator.next();\n if (p.isExpired()) {\n iterator.remove();\n }\n}\"\"\"))","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-03T14:06:17.153683Z","iopub.execute_input":"2026-04-03T14:06:17.154494Z","iopub.status.idle":"2026-04-03T14:06:17.546252Z","shell.execute_reply.started":"2026-04-03T14:06:17.154454Z","shell.execute_reply":"2026-04-03T14:06:17.545289Z"}},"outputs":[{"name":"stdout","text":"List payments = new ArrayList<>(pendingPayments);\npayments.removeIf(Payment::isExpired);\n","output_type":"stream"}],"execution_count":38},{"id":"ae99af62-1c8a-4adc-bdef-7c74c2da7696","cell_type":"code","source":"# Raw string: the Java \\n escape must reach the model as backslash + n,\n# not as a real line break inside the string literal.\nprint(optimize_java(r\"\"\"\nString result = \"\";\nfor (int i = 0; i < 100; i++) {\n result += \"Value: \" + i + \"\\n\";\n}\"\"\"))","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-03T14:08:05.318144Z","iopub.execute_input":"2026-04-03T14:08:05.318944Z","iopub.status.idle":"2026-04-03T14:08:06.065047Z","shell.execute_reply.started":"2026-04-03T14:08:05.318907Z","shell.execute_reply":"2026-04-03T14:08:06.064234Z"}},"outputs":[{"name":"stdout","text":"StringBuilder sb = new StringBuilder(100 * 8);\nfor (int i = 0; i < 100; i++) {\n sb.append(\"Value: \").append(i).append(\"\n\");\n}\nString result = sb.toString();\n","output_type":"stream"}],"execution_count":39}]}