import streamlit as st
# from gliner import GLiNER
from datasets import load_dataset
import evaluate
import numpy as np
import threading
import time
from peft import prepare_model_for_kbit_training
from peft import LoraConfig, get_peft_model, TaskType
import torch
from torch.profiler import profile, record_function, ProfilerActivity
from transformers import AutoModelForTokenClassification, AutoTokenizer, DataCollatorForTokenClassification, Trainer, TrainingArguments

seqeval = evaluate.load("seqeval")
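# seqeval scores IOB tag sequences at the entity level
# (overall precision / recall / F1, plus token-level accuracy)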
# id2label = {0: "O"}
# label2id = {"O": 0}
# def build_id2label(examples):
#     for i, label in enumerate(examples["mbert_token_classes"]):
#         if label.startswith("I-") and label not in label2id:
#             current_len = len(id2label)
#             id2label[current_len] = label
#             label2id[label] = current_len
| print(f"Is CUDA available: {torch.cuda.is_available()}") | |
| # True | |
| if torch.cuda.is_available(): | |
| print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}") | |
# Load the pretrained PII-detection model (piiranha) and its tokenizer
st.write('Loading the pretrained model ...')
model_name = "iiiorg/piiranha-v1-detect-personal-information"
model = AutoModelForTokenClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
print(model)
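# Printing the model lists its module names (e.g. the attention projections);
# those names are what the LoRA target_modules below must match.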
# Prepare model for LoRA training
model.train()  # put the model in training mode (dropout modules are active)
# enable gradient checkpointing
model.gradient_checkpointing_enable()
# prepare for quantized (k-bit) training
model = prepare_model_for_kbit_training(model)
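# prepare_model_for_kbit_training freezes the base weights, casts norm layers
# to fp32 for numerical stability, and makes the inputs require grads so
# gradient checkpointing works with a frozen backbone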
# LoRA config
config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["query_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.TOKEN_CLS
)
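# r is the adapter rank, lora_alpha scales the adapter update (alpha / r), and
# target_modules restricts the adapters to the attention query projection;
# all other weights stay frozen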
# LoRA trainable version of model
model = get_peft_model(model, config)
print(model)
# trainable parameter count
model.print_trainable_parameters()
# # print weights
# pytorch_total_params = sum(p.numel() for p in model.parameters())
# torch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
# print(f'total params: {pytorch_total_params}. tunable params: {torch_total_params}')
if torch.cuda.is_available():
    model = model.to("cuda")
# Load data.
raw_dataset = load_dataset("ai4privacy/pii-masking-400k", split='train[1:1000]')
# raw_dataset = raw_dataset.filter(lambda example: example["language"].startswith("en"))
raw_dataset = raw_dataset.train_test_split(test_size=0.2)
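# train_test_split returns a DatasetDict with 'train' (80%) and 'test' (20%) splits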
print(raw_dataset)
print(raw_dataset.column_names)
# raw_dataset = raw_dataset.select_columns(["mbert_tokens"])
# raw_dataset = raw_dataset.rename_column("mbert_tokens", "tokens")
# raw_dataset = raw_dataset.rename_column("mbert_token_classes", "labels")
# inputs = tokenizer(
#     raw_dataset['train'][0]['mbert_tokens'],
#     truncation=True,
#     is_split_into_words=True)
# print(inputs)
# print(inputs.tokens())
# print(inputs.word_ids())
# Use the label mappings from the pretrained model config
st.write("Loading label mappings from the model config")
label2id = model.config.label2id
id2label = model.config.id2label
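# Reusing the pretrained mappings keeps the classification head unchanged, so the
# dataset's tag names must already exist in model.config.label2id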
# raw_dataset.map(
#     build_id2label,
#     batched=False)
| st.write("id2label: ", model.config.id2label) | |
| st.write("label2id: ", model.config.label2id) | |
# function to align word-level labels with the tokenizer's subword tokens
# --> special tokens and padding get label id -100 (ignored by cross entropy)
# --> 'B-' tags are mapped to their 'I-' counterparts before the label2id lookup
def align_labels_with_tokens(labels, word_ids):
    aligned_label_ids = []
    for word_id in word_ids:
        if word_id is None:
            # special token ([CLS]/[SEP]) or padding
            aligned_label_ids.append(-100)
        else:
            label = labels[word_id]
            if label.startswith("B-"):
                label = label.replace("B-", "I-")
            aligned_label_ids.append(label2id[label])
    return aligned_label_ids
# create tokenize function
def tokenize_function(examples):
    # tokenize the pre-split words and truncate to the model's maximum length
    inputs = tokenizer(
        examples['mbert_tokens'],
        is_split_into_words=True,
        padding=True,
        truncation=True,
        max_length=512)
    # re-align the word-level labels with the subword tokens produced above
    new_labels = []
    for i, labels in enumerate(examples['mbert_token_classes']):
        word_ids = inputs.word_ids(batch_index=i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))
    inputs["labels"] = new_labels
    return inputs
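# # Optional sanity check (illustrative sketch, not part of the original script):
# # labels should be -100 on special tokens and line up 1:1 with input_ids
# sample = raw_dataset["train"][:1]
# encoded = tokenize_function(sample)
# print(encoded["labels"][0])
# print(len(encoded["labels"][0]) == len(encoded["input_ids"][0]))  # expect True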
# tokenize training and validation datasets
tokenized_data = raw_dataset.map(
    tokenize_function,
    batched=True)
# data collator
data_collator = DataCollatorForTokenClassification(tokenizer)
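# the collator dynamically pads input_ids/attention_mask per batch and pads the
# labels with -100 so padded positions are ignored by the loss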
st.write(tokenized_data["train"][:2]["labels"])

import os
# Print all CUDA environment variables
for key, value in os.environ.items():
    if "CUDA" in key.upper():
        print(f"{key}={value}")
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }
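# seqeval counts an entity as correct only if both its span and its type match
# the reference, so these metrics are stricter than per-token accuracy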
# hyperparameters
lr = 2e-4
batch_size = 4
num_epochs = 4
output_dir = "xia-lora-deberta-v2"
# define training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    gradient_accumulation_steps=4,
    warmup_steps=2,
    fp16=True,
    optim="paged_adamw_8bit",
)
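# with gradient_accumulation_steps=4 the effective batch size is 4 * 4 = 16 per
# device; fp16 enables mixed-precision training and paged_adamw_8bit uses the
# paged 8-bit AdamW optimizer from bitsandbytes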
# configure trainer
trainer = Trainer(
    model=model,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)
# train model
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()
# re-enable caching
model.config.use_cache = True
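# # Illustrative inference sketch (assumed usage, not part of the training run):
# # tag a new sentence with the fine-tuned adapter
# text = "Contact me at example@example.com"
# enc = tokenizer(text, return_tensors="pt").to(model.device)
# with torch.no_grad():
#     logits = model(**enc).logits
# pred_ids = logits.argmax(dim=-1)[0].tolist()
# tokens = tokenizer.convert_ids_to_tokens(enc["input_ids"][0])
# print([(t, id2label[p]) for t, p in zip(tokens, pred_ids)])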
st.write('Pushing model to the Hugging Face Hub')
# Push model to the Hugging Face Hub
hf_name = 'CarolXia'  # your hf username or org name
model_id = hf_name + "/" + output_dir
model.push_to_hub(model_id, token=st.secrets["HUGGINGFACE_TOKEN"])
trainer.push_to_hub(model_id, token=st.secrets["HUGGINGFACE_TOKEN"])