import streamlit as st
# from gliner import GLiNER
from datasets import load_dataset
import evaluate
import numpy as np
import threading
import time
from peft import prepare_model_for_kbit_training
from peft import LoraConfig, get_peft_model, TaskType
import torch
from torch.profiler import profile, record_function, ProfilerActivity
from transformers import AutoModelForTokenClassification, AutoTokenizer, DataCollatorForTokenClassification, Trainer, TrainingArguments

seqeval = evaluate.load("seqeval")
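# seqeval scores IOB tag sequences at the entity level
# (overall precision / recall / F1, plus token-level accuracy)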
# id2label = {0: "O"}
# label2id = {"O": 0}
# def build_id2label(examples):
#     for i, label in enumerate(examples["mbert_token_classes"]):
#         if label.startswith("I-") and label not in label2id:
#             current_len = len(id2label)
#             id2label[current_len] = label
#             label2id[label] = current_len
| print(f"Is CUDA available: {torch.cuda.is_available()}") | |
| # True | |
| if torch.cuda.is_available(): | |
| print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}") | |
# Load the pretrained PII-detection model (piiranha) and its tokenizer
st.write('Loading the pretrained model ...')
model_name = "iiiorg/piiranha-v1-detect-personal-information"
model = AutoModelForTokenClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
print(model)
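# Printing the model lists its module names (e.g. the attention projections);
# those names are what the LoRA target_modules below must match.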
# Prepare model for LoRA training
model.train()  # put the model in training mode (dropout modules are active)
# enable gradient checkpointing
model.gradient_checkpointing_enable()
# prepare for quantized (k-bit) training
model = prepare_model_for_kbit_training(model)
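# prepare_model_for_kbit_training freezes the base weights, casts norm layers
# to fp32 for numerical stability, and makes the inputs require grads so
# gradient checkpointing works with a frozen backbone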
# LoRA config
config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["query_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.TOKEN_CLS
)
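# r is the adapter rank, lora_alpha scales the adapter update (alpha / r), and
# target_modules restricts the adapters to the attention query projection;
# all other weights stay frozen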
# LoRA trainable version of model
model = get_peft_model(model, config)
print(model)
# trainable parameter count
model.print_trainable_parameters()
# # print weights
# pytorch_total_params = sum(p.numel() for p in model.parameters())
# torch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
# print(f'total params: {pytorch_total_params}. tunable params: {torch_total_params}')
if torch.cuda.is_available():
    model = model.to("cuda")
# Load data.
raw_dataset = load_dataset("ai4privacy/pii-masking-400k", split='train[1:1000]')
# raw_dataset = raw_dataset.filter(lambda example: example["language"].startswith("en"))
raw_dataset = raw_dataset.train_test_split(test_size=0.2)
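# train_test_split returns a DatasetDict with 'train' (80%) and 'test' (20%) splits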
print(raw_dataset)
print(raw_dataset.column_names)
# raw_dataset = raw_dataset.select_columns(["mbert_tokens"])
# raw_dataset = raw_dataset.rename_column("mbert_tokens", "tokens")
# raw_dataset = raw_dataset.rename_column("mbert_token_classes", "labels")
# inputs = tokenizer(
#     raw_dataset['train'][0]['mbert_tokens'],
#     truncation=True,
#     is_split_into_words=True)
# print(inputs)
# print(inputs.tokens())
# print(inputs.word_ids())
# Use the label mappings from the pretrained model config
st.write("Loading label mappings from the model config")
label2id = model.config.label2id
id2label = model.config.id2label
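# Reusing the pretrained mappings keeps the classification head unchanged, so the
# dataset's tag names must already exist in model.config.label2id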
# raw_dataset.map(
#     build_id2label,
#     batched=False)
| st.write("id2label: ", model.config.id2label) | |
| st.write("label2id: ", model.config.label2id) | |
# function to align word-level labels with the tokenizer's subword tokens
# --> special tokens and padding get label id -100 (ignored by cross entropy)
# --> 'B-' tags are mapped to their 'I-' counterparts before the label2id lookup
def align_labels_with_tokens(labels, word_ids):
    aligned_label_ids = []
    for word_id in word_ids:
        if word_id is None:
            # special token ([CLS]/[SEP]) or padding
            aligned_label_ids.append(-100)
        else:
            label = labels[word_id]
            if label.startswith("B-"):
                label = label.replace("B-", "I-")
            aligned_label_ids.append(label2id[label])
    return aligned_label_ids
# create tokenize function
def tokenize_function(examples):
    # tokenize the pre-split words and truncate to the model's maximum length
    inputs = tokenizer(
        examples['mbert_tokens'],
        is_split_into_words=True,
        padding=True,
        truncation=True,
        max_length=512)
    # re-align the word-level labels with the subword tokens produced above
    new_labels = []
    for i, labels in enumerate(examples['mbert_token_classes']):
        word_ids = inputs.word_ids(batch_index=i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))
    inputs["labels"] = new_labels
    return inputs
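# # Optional sanity check (illustrative sketch, not part of the original script):
# # labels should be -100 on special tokens and line up 1:1 with input_ids
# sample = raw_dataset["train"][:1]
# encoded = tokenize_function(sample)
# print(encoded["labels"][0])
# print(len(encoded["labels"][0]) == len(encoded["input_ids"][0]))  # expect True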
# tokenize training and validation datasets
tokenized_data = raw_dataset.map(
    tokenize_function,
    batched=True)
# data collator
data_collator = DataCollatorForTokenClassification(tokenizer)
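# the collator dynamically pads input_ids/attention_mask per batch and pads the
# labels with -100 so padded positions are ignored by the loss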
st.write(tokenized_data["train"][:2]["labels"])

import os
# Print all CUDA environment variables
for key, value in os.environ.items():
    if "CUDA" in key.upper():
        print(f"{key}={value}")
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }
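# seqeval counts an entity as correct only if both its span and its type match
# the reference, so these metrics are stricter than per-token accuracy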
# hyperparameters
lr = 2e-4
batch_size = 4
num_epochs = 4
output_dir = "xia-lora-deberta-v2"
# define training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    gradient_accumulation_steps=4,
    warmup_steps=2,
    fp16=True,
    optim="paged_adamw_8bit",
)
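# with gradient_accumulation_steps=4 the effective batch size is 4 * 4 = 16 per
# device; fp16 enables mixed-precision training and paged_adamw_8bit uses the
# paged 8-bit AdamW optimizer from bitsandbytes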
# configure trainer
trainer = Trainer(
    model=model,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)
# train model
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()
# re-enable caching
model.config.use_cache = True
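# # Illustrative inference sketch (assumed usage, not part of the training run):
# # tag a new sentence with the fine-tuned adapter
# text = "Contact me at example@example.com"
# enc = tokenizer(text, return_tensors="pt").to(model.device)
# with torch.no_grad():
#     logits = model(**enc).logits
# pred_ids = logits.argmax(dim=-1)[0].tolist()
# tokens = tokenizer.convert_ids_to_tokens(enc["input_ids"][0])
# print([(t, id2label[p]) for t, p in zip(tokens, pred_ids)])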
st.write('Pushing model to the Hugging Face Hub')
# Push model to the Hugging Face Hub
hf_name = 'CarolXia'  # your hf username or org name
model_id = hf_name + "/" + output_dir
model.push_to_hub(model_id, token=st.secrets["HUGGINGFACE_TOKEN"])
trainer.push_to_hub(model_id, token=st.secrets["HUGGINGFACE_TOKEN"])