In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
# 5cfcb5e8ef8458be6e85d57c45c7573477e2ad6a

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
warnings.filterwarnings("ignore")

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
df = pd.read_csv("/kaggle/input/2025-sep-dl-gen-ai-project/train.csv") #training set
dt = pd.read_csv("/kaggle/input/2025-sep-dl-gen-ai-project/test.csv")  #test set

In [3]:
from sklearn.model_selection import train_test_split

label_cols = ['anger','fear','joy','sadness','surprise']

xtrain, xval, ytrain, yval = train_test_split(
    df['text'],
    df[label_cols].values,
    test_size=0.2,
    random_state=42,
    shuffle=True
)

In [4]:
# from kaggle_secrets import UserSecretsClient
# user_secrets = UserSecretsClient()
# wdb_t = user_secrets.get_secret("WB_TOKEN")
# import wandb
# wandb.login(key=wdb_t)
# # wandb.init(project="22f3001086-t32025", name = "BERT+Classifier head")

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mvaishnavib[0m ([33mvaishnavib-iitm-jntuh-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [5]:
# !pip install nlpaug

Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl.metadata (14 kB)
Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: nlpaug
Successfully installed nlpaug-1.1.11


In [13]:
# --- Imports and Setup ---

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from torch.optim import AdamW
from tqdm.auto import tqdm
import nlpaug.augmenter.word as naw
import nltk
import pandas as pd
import numpy as np
import wandb

nltk.download('averaged_perceptron_tagger')

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

class Config:
    MODELNAME = "j-hartmann/emotion-english-distilroberta-base"
    LEARNINGRATE = 2e-5
    BATCHSIZE = 32
    EPOCHS = 6
    MAXLEN = 128
    RANDOMSEED = 42
    DROPOUT = 0.1
    WEIGHTDECAY = 0.01
    WARMUPRATIO = 0.1
    OUTPUTDIM = 5  # Number of labels
    LABELCOLS = ["anger", "fear", "joy", "sadness", "surprise"]

CONFIG = Config()

torch.manual_seed(CONFIG.RANDOMSEED)
np.random.seed(CONFIG.RANDOMSEED)

# --- Data Loading and Splitting ---

df = pd.read_csv("/kaggle/input/2025-sep-dl-gen-ai-project/train.csv")
df_test = pd.read_csv("/kaggle/input/2025-sep-dl-gen-ai-project/test.csv")

from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    df["text"], df[CONFIG.LABELCOLS].values,
    test_size=0.2,
    random_state=CONFIG.RANDOMSEED,
    shuffle=True
)

df_train = pd.DataFrame({"text": X_train})
df_train[CONFIG.LABELCOLS] = y_train

df_val = pd.DataFrame({"text": X_val})
df_val[CONFIG.LABELCOLS] = y_val

print(f"Training samples: {len(df_train)}")
print(f"Validation samples: {len(df_val)}")

# --- NLP Augmentation ---

syn_aug = naw.SynonymAug(aug_max=1, stopwords=["i", "am", "the", "a", "to", "is"])

def augment_sample(row, aug_count=1):
    new_rows = []
    text = row['text']
    for _ in range(aug_count):
        aug_text = syn_aug.augment(text)
        new_row = row.copy()
        new_row['text'] = aug_text
        new_rows.append(new_row)
    return pd.DataFrame(new_rows)

anger_samples = df_train[df_train["anger"] == 1]
joy_samples = df_train[df_train["joy"] == 1]

augmented_parts = []
for _, row in anger_samples.iterrows():
    augmented_parts.append(augment_sample(row, aug_count=2))
for _, row in joy_samples.iterrows():
    augmented_parts.append(augment_sample(row, aug_count=1))

if augmented_parts:
    df_aug = pd.concat(augmented_parts, ignore_index=True)
    df_train_aug = pd.concat([df_train, df_aug], ignore_index=True)
else:
    df_train_aug = df_train.copy()

print(f"Training size after augmentation: {len(df_train_aug)}")

# --- Positive Class Weights for BCE ---

# These class counts should correspond to training augmented set stats if possible
class_counts = {
    "anger": 808,
    "fear": 3860,
    "joy": 1660,
    "sadness": 2171,
    "surprise": 1999
}

n_train = len(df_train_aug)
pos_weights = []
for label in CONFIG.LABELCOLS:
    count = class_counts.get(label, 1)
    weight = max(1, int((n_train - count) * 0.8 / count))  # heuristic weighting
    pos_weights.append(weight)

POS_WEIGHTS = torch.tensor(pos_weights, dtype=torch.float).to(DEVICE)
print(f"Positive class weights for BCE: {POS_WEIGHTS.tolist()}")

# --- Dataset Class ---

class EmotionDataset(Dataset):
    def __init__(self, df, tokenizer, maxlen):
        self.texts = df["text"].values
        self.labels = df[CONFIG.LABELCOLS].values
        self.tokenizer = tokenizer
        self.maxlen = maxlen
    
    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        labels = torch.tensor(self.labels[idx], dtype=torch.float)
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.maxlen,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt"
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': labels
        }

tokenizer = AutoTokenizer.from_pretrained(CONFIG.MODELNAME)
train_dataset = EmotionDataset(df_train_aug, tokenizer, CONFIG.MAXLEN)
val_dataset = EmotionDataset(df_val, tokenizer, CONFIG.MAXLEN)
train_loader = DataLoader(train_dataset, batch_size=CONFIG.BATCHSIZE, shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=CONFIG.BATCHSIZE, shuffle=False, num_workers=2)

# --- Model Definition ---

class EmotionClassifier(nn.Module):
    def __init__(self, nclasses, modelname, dropout):
        super(EmotionClassifier, self).__init__()
        self.backbone = AutoModel.from_pretrained(modelname)
        self.drop = nn.Dropout(dropout)
        self.classifier = nn.Linear(self.backbone.config.hidden_size, nclasses)

    def forward(self, input_ids, attention_mask):
        outputs = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]
        dropped = self.drop(pooled_output)
        logits = self.classifier(dropped)
        return logits

model = EmotionClassifier(CONFIG.OUTPUTDIM, CONFIG.MODELNAME, CONFIG.DROPOUT).to(DEVICE)

# --- Threshold Utility Functions ---

def find_optimal_thresholds(y_true, y_probs, label_cols, num_thresholds=100):
    thresholds = {}
    for i, label in enumerate(label_cols):
        best_f1 = 0.0
        best_t = 0.5
        for t in np.linspace(0.0, 1.0, num_thresholds):
            y_pred = (y_probs[:, i] >= t).astype(int)
            f1 = f1_score(y_true[:, i], y_pred, zero_division=0)
            if f1 > best_f1:
                best_f1 = f1
                best_t = t
        thresholds[label] = best_t
    return thresholds

def calculate_macro_f1(y_true, y_probs, thresholds, label_cols):
    y_pred = np.zeros_like(y_true)
    for i, label in enumerate(label_cols):
        t = thresholds.get(label, 0.5)
        y_pred[:, i] = (y_probs[:, i] >= t).astype(int)
    return f1_score(y_true, y_pred, average='macro', zero_division=0)

# --- Training Loop with WandB Tracking ---

wandb.init(project="22f3001086-t32025", name = "roberta-emotional", config=CONFIG.__dict__)

loss_fn = nn.BCEWithLogitsLoss(pos_weight=POS_WEIGHTS)
optimizer = AdamW(model.parameters(), lr=CONFIG.LEARNINGRATE, weight_decay=CONFIG.WEIGHTDECAY)
total_steps = len(train_loader) * CONFIG.EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=int(total_steps*CONFIG.WARMUPRATIO),
                                            num_training_steps=total_steps)

best_macro_f1 = 0.0

for epoch in range(CONFIG.EPOCHS):
    model.train()
    total_loss = 0.0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1} Training"):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)

        logits = model(input_ids, attention_mask)
        loss = loss_fn(logits, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)

    model.eval()
    all_labels = []
    all_probs = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            labels = batch['labels'].cpu().numpy()

            logits = model(input_ids, attention_mask)
            probs = torch.sigmoid(logits).cpu().numpy()

            all_labels.append(labels)
            all_probs.append(probs)

    y_true = np.vstack(all_labels)
    y_probs = np.vstack(all_probs)
    optimal_thresholds = find_optimal_thresholds(y_true, y_probs, CONFIG.LABELCOLS)
    val_macro_f1 = calculate_macro_f1(y_true, y_probs, optimal_thresholds, CONFIG.LABELCOLS)

    print(f"Epoch {epoch+1} | Train Loss: {avg_train_loss:.4f} | Val Macro F1: {val_macro_f1:.4f}")
    wandb.log({
        "epoch": epoch + 1,
        "train_loss": avg_train_loss,
        "val_macro_f1": val_macro_f1,
        "learning_rate": scheduler.get_last_lr()[0],
        **{f"threshold_{label}": thr for label, thr in optimal_thresholds.items()}
    })

    if val_macro_f1 > best_macro_f1:
        best_macro_f1 = val_macro_f1
        torch.save(model.state_dict(), "best_emotion_model.pt")
        print(f"Saved best model with Macro F1: {best_macro_f1:.4f}")

wandb.finish()
print("Training complete.")

# --- Inference Code ---

class EmotionTestDataset(Dataset):
    def __init__(self, texts, tokenizer, maxlen):
        self.texts = list(texts)
        self.tokenizer = tokenizer
        self.maxlen = maxlen
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.maxlen,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
        }

def predict(model_path, test_texts, batch_size=64):
    tokenizer = AutoTokenizer.from_pretrained(CONFIG.MODELNAME)
    model = EmotionClassifier(CONFIG.OUTPUTDIM, CONFIG.MODELNAME, CONFIG.DROPOUT)
    model.load_state_dict(torch.load(model_path, map_location=DEVICE))
    model.to(DEVICE)
    model.eval()

    test_dataset = EmotionTestDataset(test_texts, tokenizer, CONFIG.MAXLEN)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

    all_probs = []
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Generating Test Predictions"):
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            logits = model(input_ids, attention_mask)
            probs = torch.sigmoid(logits).cpu().numpy()
            all_probs.append(probs)

    all_probs = np.vstack(all_probs)
    return all_probs

# Example usage:
# test_probs = predict("best_emotion_model.pt", df_test["text"])

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Using device: cuda
Training samples: 5461
Validation samples: 1366
Training size after augmentation: 8094
Positive class weights for BCE: [7.0, 1.0, 3.0, 2.0, 2.0]


pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at j-hartmann/emotion-english-distilroberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/329M [00:00<?, ?B/s]

Epoch 1 Training:   0%|          | 0/253 [00:00<?, ?it/s]

Epoch 1 | Train Loss: 0.7785 | Val Macro F1: 0.7003
Saved best model with Macro F1: 0.7003


Epoch 2 Training:   0%|          | 0/253 [00:00<?, ?it/s]

Epoch 2 | Train Loss: 0.5155 | Val Macro F1: 0.7440
Saved best model with Macro F1: 0.7440


Epoch 3 Training:   0%|          | 0/253 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f2a74111440>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
    if w.is_alive():
 Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f2a74111440>  
 Traceback (most recent call last):
    File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
     ^self._shutdown_workers()^
^  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
^    ^^if w.is_alive():
^^  ^ ^ ^  ^  File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive

 ^    assert self._parent_pid == os.getpid(), 'can only test a child process'^
 ^  ^ ^ ^ ^^ ^^  ^^ 
   File "/usr

Epoch 3 | Train Loss: 0.4007 | Val Macro F1: 0.7712
Saved best model with Macro F1: 0.7712


Epoch 4 Training:   0%|          | 0/253 [00:00<?, ?it/s]

Epoch 4 | Train Loss: 0.3265 | Val Macro F1: 0.7901
Saved best model with Macro F1: 0.7901


Epoch 5 Training:   0%|          | 0/253 [00:00<?, ?it/s]

Epoch 5 | Train Loss: 0.2777 | Val Macro F1: 0.7921
Saved best model with Macro F1: 0.7921


Epoch 6 Training:   0%|          | 0/253 [00:00<?, ?it/s]

Epoch 6 | Train Loss: 0.2466 | Val Macro F1: 0.7957
Saved best model with Macro F1: 0.7957


0,1
epoch,▁▂▄▅▇█
learning_rate,█▇▅▄▂▁
threshold_anger,▁▅▄▆█▆
threshold_fear,▅▆▂█▁▃
threshold_joy,█▁▅▁▂▂
threshold_sadness,▆▇▁▁▆█
threshold_surprise,▅▂▆▁▅█
train_loss,█▅▃▂▁▁
val_macro_f1,▁▄▆███

0,1
epoch,6.0
learning_rate,0.0
threshold_anger,0.78788
threshold_fear,0.42424
threshold_joy,0.58586
threshold_sadness,0.56566
threshold_surprise,0.65657
train_loss,0.24659
val_macro_f1,0.79567


Training complete.
