{
 "metadata": {"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.11.13","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"none","dataSources":[{"sourceId":115439,"databundleVersionId":13800781,"sourceType":"competition"},{"sourceId":13896942,"sourceType":"datasetVersion","datasetId":8852742}],"dockerImageVersionId":31193,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":false}},
 "nbformat_minor": 4,
 "nbformat": 4,
 "cells": [
  {"cell_type":"markdown","source":"# Emotion multi-label classification: submission notebook\n\nLoads the competition data, builds a train/validation split, and publishes a previously generated prediction file as `submission.csv`.\n\nThe DistilBERT training and inference cells are kept commented out for reference. The inference cell writes `submission_distB_1.csv`, which is presumably the origin of the file published in the last cell (same file name; confirm before relying on it).","metadata":{}},
  {"cell_type":"code","source":"# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n# For example, here are several helpful packages to load\n\n# All imports used by the active cells live here so a fresh kernel can run top to bottom.\nimport os\nimport warnings\n\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\nfrom sklearn.model_selection import train_test_split\n\n# NOTE: this silences every warning for the whole session, including deprecation notices.\nwarnings.filterwarnings(\"ignore\")\n\n# Input data files are available in the read-only \"../input/\" directory\n# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n\nfor dirname, _, filenames in os.walk('/kaggle/input'):\n    for filename in filenames:\n        print(os.path.join(dirname, filename))\n\n# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true,"execution":{"iopub.status.busy":"2025-11-27T14:51:35.708175Z","iopub.execute_input":"2025-11-27T14:51:35.708464Z","iopub.status.idle":"2025-11-27T14:51:35.723958Z","shell.execute_reply.started":"2025-11-27T14:51:35.708440Z","shell.execute_reply":"2025-11-27T14:51:35.723025Z"}},"outputs":[{"name":"stdout","text":"/kaggle/input/sub-files-dlgenai/submission_f24.csv\n/kaggle/input/sub-files-dlgenai/submission_distB.csv\n/kaggle/input/sub-files-dlgenai/submission_f46.csv\n/kaggle/input/sub-files-dlgenai/submission_bert_5.csv\n/kaggle/input/sub-files-dlgenai/submission_f46.1.csv\n/kaggle/input/sub-files-dlgenai/submission_bert_3.csv\n/kaggle/input/sub-files-dlgenai/submission_distB_1.csv\n/kaggle/input/2025-sep-dl-gen-ai-project/sample_submission.csv\n/kaggle/input/2025-sep-dl-gen-ai-project/train.csv\n/kaggle/input/2025-sep-dl-gen-ai-project/test.csv\n","output_type":"stream"}],"execution_count":2},
  {"cell_type":"code","source":"df = pd.read_csv(\"/kaggle/input/2025-sep-dl-gen-ai-project/train.csv\")  # training set\ndt = pd.read_csv(\"/kaggle/input/2025-sep-dl-gen-ai-project/test.csv\")  # test set","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-11-27T14:51:35.725326Z","iopub.execute_input":"2025-11-27T14:51:35.725591Z","iopub.status.idle":"2025-11-27T14:51:35.783046Z","shell.execute_reply.started":"2025-11-27T14:51:35.725572Z","shell.execute_reply":"2025-11-27T14:51:35.782339Z"}},"outputs":[],"execution_count":3},
  {"cell_type":"code","source":"# Target columns: one binary indicator per emotion.\nlabel_cols = ['anger','fear','joy','sadness','surprise']\n\n# 80/20 train/validation split with a fixed seed so the split is reproducible.\nxtrain, xval, ytrain, yval = train_test_split(\n    df['text'],\n    df[label_cols].values,\n    test_size=0.2,\n    random_state=42,\n    shuffle=True\n)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-11-27T14:51:35.783712Z","iopub.execute_input":"2025-11-27T14:51:35.783915Z","iopub.status.idle":"2025-11-27T14:51:36.563927Z","shell.execute_reply.started":"2025-11-27T14:51:35.783889Z","shell.execute_reply":"2025-11-27T14:51:36.562024Z"}},"outputs":[],"execution_count":4},
  {"cell_type":"markdown","source":"## Reference: DistilBERT training and inference (disabled)\n\nThe cells below are commented out and do nothing when run. They are kept as a record of the fine-tuning and inference code.","metadata":{}},
  {"cell_type":"code","source":"# import wandb\n# wandb.login(key=os.environ.get(\"WB_TOKEN\"))\n# # wandb.init(project=\"22f3001086-t32025\", name = \"BERT+Classifier head\")","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-11-27T14:51:36.565603Z","iopub.execute_input":"2025-11-27T14:51:36.566097Z","iopub.status.idle":"2025-11-27T14:51:36.570291Z","shell.execute_reply.started":"2025-11-27T14:51:36.566068Z","shell.execute_reply":"2025-11-27T14:51:36.569665Z"}},"outputs":[],"execution_count":5},
  {"cell_type":"code","source":"# # !pip install -q transformers wandb\n\n# import os\n# import random\n# import numpy as np\n# import pandas as pd\n# import torch\n# import torch.nn as nn\n# from torch.utils.data import Dataset, DataLoader\n# from sklearn.model_selection import train_test_split\n# from sklearn.metrics import f1_score\n\n# from transformers import (\n#     DistilBertTokenizerFast,\n#     DistilBertModel,\n#     get_linear_schedule_with_warmup,\n# )\n# from torch.optim import AdamW  # use PyTorch AdamW, not transformers\n# print(\"Done!\")","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-11-27T14:51:36.570942Z","iopub.execute_input":"2025-11-27T14:51:36.571131Z","iopub.status.idle":"2025-11-27T14:51:36.592661Z","shell.execute_reply.started":"2025-11-27T14:51:36.571114Z","shell.execute_reply":"2025-11-27T14:51:36.591781Z"}},"outputs":[],"execution_count":6},
  {"cell_type":"code","source":"# SEED = 42\n# random.seed(SEED)\n# np.random.seed(SEED)\n# torch.manual_seed(SEED)\n# torch.cuda.manual_seed_all(SEED)\n\n# device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n# print(\"Device:\", device)\n\n# label_cols = ['anger','fear','joy','sadness','surprise']\n\n# config = {\n#     \"model_name\": \"distilbert-base-uncased\",\n#     \"max_length\": 128,\n#     \"batch_size\": 16,\n#     \"lr\": 1e-5,\n#     \"weight_decay\": 0.01,\n#     \"epochs\": 5,\n#     \"warmup_ratio\": 0.1,\n# }","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-11-27T14:51:36.593480Z","iopub.execute_input":"2025-11-27T14:51:36.593762Z","iopub.status.idle":"2025-11-27T14:51:36.622469Z","shell.execute_reply.started":"2025-11-27T14:51:36.593739Z","shell.execute_reply":"2025-11-27T14:51:36.621017Z"}},"outputs":[],"execution_count":7},
  {"cell_type":"code","source":"# tokenizer = DistilBertTokenizerFast.from_pretrained(config[\"model_name\"])\n\n# class EmotionDataset(Dataset):\n#     def __init__(self, texts, labels, tokenizer, max_length):\n#         self.texts = list(texts)\n#         self.labels = labels\n#         self.tokenizer = tokenizer\n#         self.max_length = max_length\n\n#     def __len__(self):\n#         return len(self.texts)\n\n#     def __getitem__(self, idx):\n#         text = str(self.texts[idx])\n#         encoding = self.tokenizer(\n#             text,\n#             truncation=True,\n#             padding=\"max_length\",\n#             max_length=self.max_length,\n#             return_tensors=\"pt\",\n#         )\n#         item = {k: v.squeeze(0) for k, v in encoding.items()}\n#         item[\"labels\"] = torch.tensor(self.labels[idx], dtype=torch.float32)\n#         return item\n\n# train_ds = EmotionDataset(xtrain, ytrain, tokenizer, config[\"max_length\"])\n# val_ds = EmotionDataset(xval, yval, tokenizer, config[\"max_length\"])\n\n# train_loader = DataLoader(train_ds, batch_size=config[\"batch_size\"], shuffle=True)\n# val_loader = DataLoader(val_ds, batch_size=config[\"batch_size\"], shuffle=False)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-11-27T14:51:36.623711Z","iopub.execute_input":"2025-11-27T14:51:36.623979Z","iopub.status.idle":"2025-11-27T14:51:36.643341Z","shell.execute_reply.started":"2025-11-27T14:51:36.623956Z","shell.execute_reply":"2025-11-27T14:51:36.641731Z"}},"outputs":[],"execution_count":8},
  {"cell_type":"code","source":"# class DistilBertForMultiLabel(nn.Module):\n#     def __init__(self, model_name, num_labels, dropout=0.5):\n#         super().__init__()\n#         self.distilbert = DistilBertModel.from_pretrained(model_name)\n#         hidden_size = self.distilbert.config.hidden_size\n#         self.dropout = nn.Dropout(dropout)\n#         self.classifier = nn.Linear(hidden_size, num_labels)\n\n#     def forward(self, input_ids, attention_mask):\n#         outputs = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)\n#         cls_hidden = outputs.last_hidden_state[:, 0, :]  # [CLS]-like token\n#         x = self.dropout(cls_hidden)\n#         logits = self.classifier(x)\n#         return logits\n\n# model = DistilBertForMultiLabel(config[\"model_name\"], num_labels=len(label_cols))\n# model.to(device)\n\n# criterion = nn.BCEWithLogitsLoss()\n# optimizer = AdamW(model.parameters(), lr=config[\"lr\"], weight_decay=config[\"weight_decay\"])","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-11-27T14:51:36.644220Z","iopub.execute_input":"2025-11-27T14:51:36.644437Z","iopub.status.idle":"2025-11-27T14:51:36.669101Z","shell.execute_reply.started":"2025-11-27T14:51:36.644421Z","shell.execute_reply":"2025-11-27T14:51:36.668362Z"}},"outputs":[],"execution_count":9},
  {"cell_type":"code","source":"# num_training_steps = config[\"epochs\"] * len(train_loader)\n# num_warmup_steps = int(config[\"warmup_ratio\"] * num_training_steps)\n\n# scheduler = get_linear_schedule_with_warmup(\n#     optimizer,\n#     num_warmup_steps=num_warmup_steps,\n#     num_training_steps=num_training_steps,\n# )\n\n# def evaluate(model, loader):\n#     model.eval()\n#     all_logits = []\n#     all_labels = []\n#     with torch.no_grad():\n#         for batch in loader:\n#             input_ids = batch[\"input_ids\"].to(device)\n#             attention_mask = batch[\"attention_mask\"].to(device)\n#             labels = batch[\"labels\"].to(device)\n\n#             logits = model(input_ids=input_ids, attention_mask=attention_mask)\n#             all_logits.append(logits.cpu())\n#             all_labels.append(labels.cpu())\n\n#     all_logits = torch.cat(all_logits)\n#     all_labels = torch.cat(all_labels)\n\n#     probs = torch.sigmoid(all_logits).numpy()\n#     preds = (probs >= 0.5).astype(int)\n#     y_true = all_labels.numpy()\n\n#     macro_f1 = f1_score(y_true, preds, average=\"macro\")\n#     return macro_f1","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-11-27T14:51:36.669860Z","iopub.execute_input":"2025-11-27T14:51:36.670110Z","iopub.status.idle":"2025-11-27T14:51:36.694247Z","shell.execute_reply.started":"2025-11-27T14:51:36.670089Z","shell.execute_reply":"2025-11-27T14:51:36.693111Z"}},"outputs":[],"execution_count":10},
  {"cell_type":"code","source":"# best_f1 = 0.0\n# wandb.init(project=\"22f3001086-t32025\", config=config, name = \"distilbert 1\")\n\n# for epoch in range(1, config[\"epochs\"] + 1):\n#     model.train()\n#     total_loss = 0.0\n\n#     for batch in train_loader:\n#         input_ids = batch[\"input_ids\"].to(device)\n#         attention_mask = batch[\"attention_mask\"].to(device)\n#         labels = batch[\"labels\"].to(device)\n\n#         optimizer.zero_grad()\n#         logits = model(input_ids=input_ids, attention_mask=attention_mask)\n#         loss = criterion(logits, labels)\n#         loss.backward()\n#         optimizer.step()\n#         scheduler.step()\n\n#         total_loss += loss.item() * input_ids.size(0)\n\n#     train_loss = total_loss / len(train_ds)\n#     val_f1 = evaluate(model, val_loader)\n\n#     print(f\"Epoch {epoch}: train_loss={train_loss:.4f} val_macro_f1={val_f1:.4f}\")\n\n#     wandb.log({\n#         \"epoch\": epoch,\n#         \"train_loss\": train_loss,\n#         \"val_macro_f1\": val_f1,\n#         \"learning_rate\": optimizer.param_groups[0][\"lr\"],\n#     })\n\n#     if val_f1 > best_f1:\n#         best_f1 = val_f1\n#         torch.save(model.state_dict(), \"distilbert_best 1.pt\")\n#         print(\" -> new best model saved\")\n\n# wandb.finish()","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-11-27T14:51:36.696233Z","iopub.execute_input":"2025-11-27T14:51:36.696646Z","iopub.status.idle":"2025-11-27T14:51:36.712255Z","shell.execute_reply.started":"2025-11-27T14:51:36.696626Z","shell.execute_reply":"2025-11-27T14:51:36.711609Z"}},"outputs":[],"execution_count":11},
  {"cell_type":"code","source":"# import re\n# import torch\n# import torch.nn as nn\n# import pandas as pd\n# import numpy as np\n# from transformers import DistilBertTokenizerFast, DistilBertModel\n\n# device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n# print(\"Device:\", device)\n\n# label_cols = ['anger','fear','joy','sadness','surprise']\n# model_name = \"distilbert-base-uncased\"\n\n# tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)\n\n# class DistilBertForMultiLabel(nn.Module):\n#     def __init__(self, model_name, num_labels, dropout=0.3):\n#         super().__init__()\n#         self.distilbert = DistilBertModel.from_pretrained(model_name)\n#         hidden_size = self.distilbert.config.hidden_size\n#         self.dropout = nn.Dropout(dropout)\n#         self.classifier = nn.Linear(hidden_size, num_labels)\n\n#     def forward(self, input_ids, attention_mask):\n#         outputs = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)\n#         cls_hidden = outputs.last_hidden_state[:, 0, :]\n#         x = self.dropout(cls_hidden)\n#         logits = self.classifier(x)\n#         return logits\n\n# model = DistilBertForMultiLabel(model_name, num_labels=len(label_cols)).to(device)\n# state_dict = torch.load(\n#     \"/kaggle/working/distilbert_best 1.pt\",  # your dataset path\n#     map_location=device,\n# )\n# model.load_state_dict(state_dict)\n# model.eval()\n\n# test_df = pd.read_csv(\"/kaggle/input/2025-sep-dl-gen-ai-project/test.csv\")\n\n# all_preds = []\n# batch_size = 32\n# max_length = 128\n\n# with torch.no_grad():\n#     for i in range(0, len(test_df), batch_size):\n#         texts = test_df[\"text\"].iloc[i:i+batch_size].tolist()\n#         enc = tokenizer(\n#             texts,\n#             truncation=True,\n#             padding=\"max_length\",\n#             max_length=max_length,\n#             return_tensors=\"pt\",\n#         )\n#         input_ids = enc[\"input_ids\"].to(device)\n#         attention_mask = enc[\"attention_mask\"].to(device)\n\n#         logits = model(input_ids=input_ids, attention_mask=attention_mask)\n#         probs = torch.sigmoid(logits).cpu().numpy()\n#         preds = (probs >= 0.5).astype(int)\n#         all_preds.append(preds)\n\n# all_preds = np.vstack(all_preds)\n# sub = pd.DataFrame(all_preds, columns=label_cols)\n# sub.insert(0, \"id\", test_df[\"id\"])\n# sub.to_csv(\"submission_distB_1.csv\", index=False)\n# sub.head()","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-11-27T14:51:36.712938Z","iopub.execute_input":"2025-11-27T14:51:36.713169Z","iopub.status.idle":"2025-11-27T14:51:36.736737Z","shell.execute_reply.started":"2025-11-27T14:51:36.713150Z","shell.execute_reply":"2025-11-27T14:51:36.735673Z"}},"outputs":[],"execution_count":12},
  {"cell_type":"markdown","source":"## Publish the submission\n\nCopies a precomputed prediction file from the attached dataset to `submission.csv`, after checking that it lines up with the test set loaded above.","metadata":{}},
  {"cell_type":"code","source":"# 1. Load the previously generated predictions from the attached dataset\ninput_path = \"/kaggle/input/sub-files-dlgenai/submission_distB_1.csv\"\ndf_sub = pd.read_csv(input_path)\n\n# Quick sanity check\nprint(df_sub.head())\nprint(\"\\nColumns :\")\nprint(df_sub.columns)\n\n# 2. Validate against the test set (`dt`) before publishing, so a stale or\n#    mismatched file fails here rather than at scoring time\nexpected_columns = [\"id\", *label_cols]\nassert list(df_sub.columns) == expected_columns, f\"unexpected columns: {list(df_sub.columns)}\"\nassert len(df_sub) == len(dt), f\"expected {len(dt)} rows, got {len(df_sub)}\"\nassert df_sub[\"id\"].is_unique, \"duplicate ids in submission\"\nassert set(df_sub[\"id\"]) == set(dt[\"id\"]), \"submission ids do not match test ids\"\nassert df_sub[label_cols].isin([0, 1]).all().all(), \"labels must be 0 or 1\"\n\n# 3. Save it as the competition submission file in the working/output dir\ndf_sub.to_csv(\"submission.csv\", index=False)\n\nprint(\"Done!\")","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-11-27T14:53:14.946885Z","iopub.execute_input":"2025-11-27T14:53:14.947196Z","iopub.status.idle":"2025-11-27T14:53:14.969737Z","shell.execute_reply.started":"2025-11-27T14:53:14.947176Z","shell.execute_reply":"2025-11-27T14:53:14.968844Z"}},"outputs":[{"name":"stdout","text":"   id  anger  fear  joy  sadness  surprise\n0   0      1     1    0        1         0\n1   1      0     0    0        0         0\n2   2      1     1    0        0         1\n3   3      0     1    0        0         0\n4   4      0     1    0        0         1\n\nColumns :\nIndex(['id', 'anger', 'fear', 'joy', 'sadness', 'surprise'], dtype='object')\nDone!\n","output_type":"stream"}],"execution_count":14}
 ]
}