{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.11.13","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"none","dataSources":[{"sourceId":115439,"databundleVersionId":13800781,"sourceType":"competition"},{"sourceId":13896942,"sourceType":"datasetVersion","datasetId":8852742}],"dockerImageVersionId":31193,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":false}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n# For example, here's several helpful packages to load\n\n# 5cfcb5e8ef8458be6e85d57c45c7573477e2ad6a\n\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv)\nimport warnings\nwarnings.filterwarnings(\"ignore\")  # silences every warning category for the rest of the session\n\n# Input data files are available in the read-only \"../input/\" directory\n# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n\nimport os\n# Echo the full path of every mounted input file so the available data is visible in the cell output\nfor root, _, names in os.walk('/kaggle/input'):\n    for name in names:\n        print(os.path.join(root, name))\n\n# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true,"execution":{"iopub.status.busy":"2025-11-27T14:51:35.708175Z","iopub.execute_input":"2025-11-27T14:51:35.708464Z","iopub.status.idle":"2025-11-27T14:51:35.723958Z","shell.execute_reply.started":"2025-11-27T14:51:35.708440Z","shell.execute_reply":"2025-11-27T14:51:35.723025Z"}},"outputs":[{"name":"stdout","text":"/kaggle/input/sub-files-dlgenai/submission_f24.csv\n/kaggle/input/sub-files-dlgenai/submission_distB.csv\n/kaggle/input/sub-files-dlgenai/submission_f46.csv\n/kaggle/input/sub-files-dlgenai/submission_bert_5.csv\n/kaggle/input/sub-files-dlgenai/submission_f46.1.csv\n/kaggle/input/sub-files-dlgenai/submission_bert_3.csv\n/kaggle/input/sub-files-dlgenai/submission_distB_1.csv\n/kaggle/input/2025-sep-dl-gen-ai-project/sample_submission.csv\n/kaggle/input/2025-sep-dl-gen-ai-project/train.csv\n/kaggle/input/2025-sep-dl-gen-ai-project/test.csv\n","output_type":"stream"}],"execution_count":2},{"cell_type":"code","source":"df = pd.read_csv(\"/kaggle/input/2025-sep-dl-gen-ai-project/train.csv\")  # training set\ndt = pd.read_csv(\"/kaggle/input/2025-sep-dl-gen-ai-project/test.csv\")  #test 
set","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-11-27T14:51:35.725326Z","iopub.execute_input":"2025-11-27T14:51:35.725591Z","iopub.status.idle":"2025-11-27T14:51:35.783046Z","shell.execute_reply.started":"2025-11-27T14:51:35.725572Z","shell.execute_reply":"2025-11-27T14:51:35.782339Z"}},"outputs":[],"execution_count":3},{"cell_type":"code","source":"from sklearn.model_selection import train_test_split\n\n# Target columns, one per emotion\nlabel_cols = ['anger','fear','joy','sadness','surprise']\n\n# Hold out 20% of the training rows for validation; the fixed seed keeps the split repeatable\nxtrain, xval, ytrain, yval = train_test_split(\n    df['text'], df[label_cols].values,\n    test_size=0.2, shuffle=True, random_state=42,\n)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-11-27T14:51:35.783712Z","iopub.execute_input":"2025-11-27T14:51:35.783915Z","iopub.status.idle":"2025-11-27T14:51:36.563927Z","shell.execute_reply.started":"2025-11-27T14:51:35.783889Z","shell.execute_reply":"2025-11-27T14:51:36.562024Z"}},"outputs":[],"execution_count":4},{"cell_type":"code","source":"# import wandb\n# wandb.login(key=os.environ.get(\"WB_TOKEN\"))\n# # wandb.init(project=\"22f3001086-t32025\", name = \"BERT+Classifier head\")","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-11-27T14:51:36.565603Z","iopub.execute_input":"2025-11-27T14:51:36.566097Z","iopub.status.idle":"2025-11-27T14:51:36.570291Z","shell.execute_reply.started":"2025-11-27T14:51:36.566068Z","shell.execute_reply":"2025-11-27T14:51:36.569665Z"}},"outputs":[],"execution_count":5},{"cell_type":"code","source":"# # !pip install -q transformers wandb\n\n# import os\n# import random\n# import numpy as np\n# import pandas as pd\n# import torch\n# import torch.nn as nn\n# from torch.utils.data import Dataset, DataLoader\n# from sklearn.model_selection import train_test_split\n# from sklearn.metrics import f1_score\n\n# from transformers import (\n#     DistilBertTokenizerFast,\n#     DistilBertModel,\n#     get_linear_schedule_with_warmup,\n# )\n# from torch.optim import 
AdamW   # use PyTorch AdamW, not transformers\n# print(\"Done!\")","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-11-27T14:51:36.570942Z","iopub.execute_input":"2025-11-27T14:51:36.571131Z","iopub.status.idle":"2025-11-27T14:51:36.592661Z","shell.execute_reply.started":"2025-11-27T14:51:36.571114Z","shell.execute_reply":"2025-11-27T14:51:36.591781Z"}},"outputs":[],"execution_count":6},{"cell_type":"code","source":"# SEED = 42\n# random.seed(SEED)\n# np.random.seed(SEED)\n# torch.manual_seed(SEED)\n# torch.cuda.manual_seed_all(SEED)\n\n# device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n# print(\"Device:\", device)\n\n# label_cols = ['anger','fear','joy','sadness','surprise']\n\n# config = {\n#     \"model_name\": \"distilbert-base-uncased\",\n#     \"max_length\": 128,\n#     \"batch_size\": 16,\n#     \"lr\": 1e-5,\n#     \"weight_decay\": 0.01,\n#     \"epochs\": 5,\n#     \"warmup_ratio\": 0.1,\n# }","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-11-27T14:51:36.593480Z","iopub.execute_input":"2025-11-27T14:51:36.593762Z","iopub.status.idle":"2025-11-27T14:51:36.622469Z","shell.execute_reply.started":"2025-11-27T14:51:36.593739Z","shell.execute_reply":"2025-11-27T14:51:36.621017Z"}},"outputs":[],"execution_count":7},{"cell_type":"code","source":"# tokenizer = DistilBertTokenizerFast.from_pretrained(config[\"model_name\"])\n\n# class EmotionDataset(Dataset):\n#     def __init__(self, texts, labels, tokenizer, max_length):\n#         self.texts = list(texts)\n#         self.labels = labels\n#         self.tokenizer = tokenizer\n#         self.max_length = max_length\n\n#     def __len__(self):\n#         return len(self.texts)\n\n#     def __getitem__(self, idx):\n#         text = str(self.texts[idx])\n#         encoding = self.tokenizer(\n#             text,\n#             truncation=True,\n#             padding=\"max_length\",\n#             max_length=self.max_length,\n#             
return_tensors=\"pt\",\n#         )\n#         item = {k: v.squeeze(0) for k, v in encoding.items()}\n#         item[\"labels\"] = torch.tensor(self.labels[idx], dtype=torch.float32)\n#         return item\n\n# train_ds = EmotionDataset(xtrain, ytrain, tokenizer, config[\"max_length\"])\n# val_ds   = EmotionDataset(xval,   yval,   tokenizer, config[\"max_length\"])\n\n# train_loader = DataLoader(train_ds, batch_size=config[\"batch_size\"], shuffle=True)\n# val_loader   = DataLoader(val_ds,   batch_size=config[\"batch_size\"], shuffle=False)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-11-27T14:51:36.623711Z","iopub.execute_input":"2025-11-27T14:51:36.623979Z","iopub.status.idle":"2025-11-27T14:51:36.643341Z","shell.execute_reply.started":"2025-11-27T14:51:36.623956Z","shell.execute_reply":"2025-11-27T14:51:36.641731Z"}},"outputs":[],"execution_count":8},{"cell_type":"code","source":"# class DistilBertForMultiLabel(nn.Module):\n#     def __init__(self, model_name, num_labels, dropout=0.5):\n#         super().__init__()\n#         self.distilbert = DistilBertModel.from_pretrained(model_name)\n#         hidden_size = self.distilbert.config.hidden_size\n#         self.dropout = nn.Dropout(dropout)\n#         self.classifier = nn.Linear(hidden_size, num_labels)\n\n#     def forward(self, input_ids, attention_mask):\n#         outputs = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)\n#         cls_hidden = outputs.last_hidden_state[:, 0, :]   # [CLS]-like token\n#         x = self.dropout(cls_hidden)\n#         logits = self.classifier(x)\n#         return logits\n\n# model = DistilBertForMultiLabel(config[\"model_name\"], num_labels=len(label_cols))\n# model.to(device)\n\n# criterion = nn.BCEWithLogitsLoss()\n# optimizer = AdamW(model.parameters(), lr=config[\"lr\"], 
weight_decay=config[\"weight_decay\"])","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-11-27T14:51:36.644220Z","iopub.execute_input":"2025-11-27T14:51:36.644437Z","iopub.status.idle":"2025-11-27T14:51:36.669101Z","shell.execute_reply.started":"2025-11-27T14:51:36.644421Z","shell.execute_reply":"2025-11-27T14:51:36.668362Z"}},"outputs":[],"execution_count":9},{"cell_type":"code","source":"# num_training_steps = config[\"epochs\"] * len(train_loader)\n# num_warmup_steps = int(config[\"warmup_ratio\"] * num_training_steps)\n\n# scheduler = get_linear_schedule_with_warmup(\n#     optimizer,\n#     num_warmup_steps=num_warmup_steps,\n#     num_training_steps=num_training_steps,\n# )\n\n# def evaluate(model, loader):\n#     model.eval()\n#     all_logits = []\n#     all_labels = []\n#     with torch.no_grad():\n#         for batch in loader:\n#             input_ids = batch[\"input_ids\"].to(device)\n#             attention_mask = batch[\"attention_mask\"].to(device)\n#             labels = batch[\"labels\"].to(device)\n\n#             logits = model(input_ids=input_ids, attention_mask=attention_mask)\n#             all_logits.append(logits.cpu())\n#             all_labels.append(labels.cpu())\n\n#     all_logits = torch.cat(all_logits)\n#     all_labels = torch.cat(all_labels)\n\n#     probs = torch.sigmoid(all_logits).numpy()\n#     preds = (probs >= 0.5).astype(int)\n#     y_true = all_labels.numpy()\n\n#     macro_f1 = f1_score(y_true, preds, average=\"macro\")\n#     return macro_f1","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-11-27T14:51:36.669860Z","iopub.execute_input":"2025-11-27T14:51:36.670110Z","iopub.status.idle":"2025-11-27T14:51:36.694247Z","shell.execute_reply.started":"2025-11-27T14:51:36.670089Z","shell.execute_reply":"2025-11-27T14:51:36.693111Z"}},"outputs":[],"execution_count":10},{"cell_type":"code","source":"# best_f1 = 0.0\n# wandb.init(project=\"22f3001086-t32025\", config=config, name = \"distilbert 
1\")\n\n# for epoch in range(1, config[\"epochs\"] + 1):\n#     model.train()\n#     total_loss = 0.0\n\n#     for batch in train_loader:\n#         input_ids = batch[\"input_ids\"].to(device)\n#         attention_mask = batch[\"attention_mask\"].to(device)\n#         labels = batch[\"labels\"].to(device)\n\n#         optimizer.zero_grad()\n#         logits = model(input_ids=input_ids, attention_mask=attention_mask)\n#         loss = criterion(logits, labels)\n#         loss.backward()\n#         optimizer.step()\n#         scheduler.step()\n\n#         total_loss += loss.item() * input_ids.size(0)\n\n#     train_loss = total_loss / len(train_ds)\n#     val_f1 = evaluate(model, val_loader)\n\n#     print(f\"Epoch {epoch}: train_loss={train_loss:.4f} val_macro_f1={val_f1:.4f}\")\n\n#     wandb.log({\n#         \"epoch\": epoch,\n#         \"train_loss\": train_loss,\n#         \"val_macro_f1\": val_f1,\n#         \"learning_rate\": optimizer.param_groups[0][\"lr\"],\n#     })\n\n#     if val_f1 > best_f1:\n#         best_f1 = val_f1\n#         torch.save(model.state_dict(), \"distilbert_best 1.pt\")\n#         print(\"  -> new best model saved\")\n\n# wandb.finish()","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-11-27T14:51:36.696233Z","iopub.execute_input":"2025-11-27T14:51:36.696646Z","iopub.status.idle":"2025-11-27T14:51:36.712255Z","shell.execute_reply.started":"2025-11-27T14:51:36.696626Z","shell.execute_reply":"2025-11-27T14:51:36.711609Z"}},"outputs":[],"execution_count":11},{"cell_type":"code","source":"# import re\n# import torch\n# import torch.nn as nn\n# import pandas as pd\n# import numpy as np\n# from transformers import DistilBertTokenizerFast, DistilBertModel\n\n# device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n# print(\"Device:\", device)\n\n# label_cols = ['anger','fear','joy','sadness','surprise']\n# model_name = \"distilbert-base-uncased\"\n\n# tokenizer = 
DistilBertTokenizerFast.from_pretrained(model_name)\n\n# class DistilBertForMultiLabel(nn.Module):\n#     def __init__(self, model_name, num_labels, dropout=0.3):\n#         super().__init__()\n#         self.distilbert = DistilBertModel.from_pretrained(model_name)\n#         hidden_size = self.distilbert.config.hidden_size\n#         self.dropout = nn.Dropout(dropout)\n#         self.classifier = nn.Linear(hidden_size, num_labels)\n\n#     def forward(self, input_ids, attention_mask):\n#         outputs = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)\n#         cls_hidden = outputs.last_hidden_state[:, 0, :]\n#         x = self.dropout(cls_hidden)\n#         logits = self.classifier(x)\n#         return logits\n\n# model = DistilBertForMultiLabel(model_name, num_labels=len(label_cols)).to(device)\n# state_dict = torch.load(\n#     \"/kaggle/working/distilbert_best 1.pt\",  # your dataset path\n#     map_location=device,\n# )\n# model.load_state_dict(state_dict)\n# model.eval()\n\n# test_df = pd.read_csv(\"/kaggle/input/2025-sep-dl-gen-ai-project/test.csv\")\n\n# all_preds = []\n# batch_size = 32\n# max_length = 128\n\n# with torch.no_grad():\n#     for i in range(0, len(test_df), batch_size):\n#         texts = test_df[\"text\"].iloc[i:i+batch_size].tolist()\n#         enc = tokenizer(\n#             texts,\n#             truncation=True,\n#             padding=\"max_length\",\n#             max_length=max_length,\n#             return_tensors=\"pt\",\n#         )\n#         input_ids = enc[\"input_ids\"].to(device)\n#         attention_mask = enc[\"attention_mask\"].to(device)\n\n#         logits = model(input_ids=input_ids, attention_mask=attention_mask)\n#         probs = torch.sigmoid(logits).cpu().numpy()\n#         preds = (probs >= 0.5).astype(int)\n#         all_preds.append(preds)\n\n# all_preds = np.vstack(all_preds)\n# sub = pd.DataFrame(all_preds, columns=label_cols)\n# sub.insert(0, \"id\", test_df[\"id\"])\n# 
sub.to_csv(\"submission_distB_1.csv\", index=False)\n# sub.head()","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-11-27T14:51:36.712938Z","iopub.execute_input":"2025-11-27T14:51:36.713169Z","iopub.status.idle":"2025-11-27T14:51:36.736737Z","shell.execute_reply.started":"2025-11-27T14:51:36.713150Z","shell.execute_reply":"2025-11-27T14:51:36.735673Z"}},"outputs":[],"execution_count":12},{"cell_type":"code","source":"import pandas as pd\nimport os\n\n# Publish a previously generated prediction file as this notebook's submission.\n# The file comes from an uploaded dataset rather than a model run in this session,\n# so it is validated before being written out.\ninput_path = \"/kaggle/input/sub-files-dlgenai/submission_distB_1.csv\"\ntest_path = \"/kaggle/input/2025-sep-dl-gen-ai-project/test.csv\"\nexpected_cols = ['id', 'anger', 'fear', 'joy', 'sadness', 'surprise']\n\n# 1. Load the chosen predictions from the input directory\ndf_sub = pd.read_csv(input_path)\n\n# Quick look at what is about to be submitted\nprint(df_sub.head())\nprint(\"\\nColumns :\")\nprint(df_sub.columns)\n\n# 2. Validate the file so a stale or malformed CSV is never written as submission.csv\nif list(df_sub.columns) != expected_cols:\n    raise ValueError(f\"Unexpected submission columns: {list(df_sub.columns)}\")\n\nlabel_frame = df_sub[expected_cols[1:]]\nif not label_frame.isin([0, 1]).all().all():  # NaN also fails this check\n    raise ValueError(\"Label columns must contain only 0/1 values\")\n\ntest_ids = pd.read_csv(test_path, usecols=[\"id\"])[\"id\"]\nif df_sub[\"id\"].duplicated().any() or set(df_sub[\"id\"]) != set(test_ids):\n    raise ValueError(\"Submission ids do not match the ids in test.csv\")\n\n# 3. Save it as the competition submission file in the working/output dir\ndf_sub.to_csv(\"submission.csv\", index=False)\n\nprint(\"Done!\")","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-11-27T14:53:14.946885Z","iopub.execute_input":"2025-11-27T14:53:14.947196Z","iopub.status.idle":"2025-11-27T14:53:14.969737Z","shell.execute_reply.started":"2025-11-27T14:53:14.947176Z","shell.execute_reply":"2025-11-27T14:53:14.968844Z"}},"outputs":[{"name":"stdout","text":"   id  anger  fear  joy  sadness  surprise\n0   0      1     1    0        1         0\n1   1      0     0    0        0         0\n2   2      1     1    0        0         1\n3   3      0     1    0        0         0\n4   4      0     1    0        0         1\n\nColumns :\nIndex(['id', 'anger', 'fear', 'joy', 'sadness', 'surprise'], dtype='object')\nDone!\n","output_type":"stream"}],"execution_count":14}]}