from utils import *
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import unicodedata
import re
import json
import numpy as np
import pandas as pd
from copy import deepcopy  # used when cloning the trained model below (may also be re-exported by utils)
# Undesirable patterns within texts (section headers, symbols, HTML remnants)
patterns = {
    'CONCLUSIONS AND IMPLICATIONS': '',
    'BACKGROUND AND PURPOSE': '',
    'EXPERIMENTAL APPROACH': '',
    'KEY RESULTS AEA': '',
    '©': '',
    '®': '',
    'μ': '',
    '(C)': '',
    'OBJECTIVE:': '',
    'MATERIALS AND METHODS:': '',
    'SIGNIFICANCE:': '',
    'BACKGROUND:': '',
    'RESULTS:': '',
    'METHODS:': '',
    'CONCLUSIONS:': '',
    'AIM:': '',
    'STUDY DESIGN:': '',
    'CLINICAL RELEVANCE:': '',
    'CONCLUSION:': '',
    'HYPOTHESIS:': '',
    'Questions/Purposes:': '',
    'Introduction:': '',
    'PURPOSE:': '',
    'PATIENTS AND METHODS:': '',
    'FINDINGS:': '',
    'INTERPRETATIONS:': '',
    'FUNDING:': '',
    'PROGRESS:': '',
    'CONTEXT:': '',
    'MEASURES:': '',
    'DESIGN:': '',
    'BACKGROUND AND OBJECTIVES:': '',
    '<p>': '',
    '</p>': '',
    '<<ETX>>': '',
    '+/-': '',
}

# Texts are lower-cased before replacement, so lower-case the keys as well
patterns = {x.lower(): y for x, y in patterns.items()}
class treat_text:
    """Callable text cleaner applied to every abstract before tokenization."""

    def __init__(self, patterns):
        self.patterns = patterns

    def __call__(self, text):
        text = unicodedata.normalize("NFKD", str(text))
        text = multiple_replace(self.patterns, text.lower())
        # Drop parentheticals, bracketed spans, isolated digits, angle brackets and dangling hyphens
        text = re.sub(r'(\(.+\))|(\[.+\])|( \d )|(<)|(>)|(- )', '', text)
        # Collapse repeated whitespace and stray comma sequences
        text = re.sub(r'( +)', ' ', text)
        text = re.sub(r'(, ,)|(,,)', ',', text)
        text = re.sub(r'(%)|(per cent)', ' percent', text)
        return text

# Regex multiple replace function
def multiple_replace(mapping, text):
    # Build a single alternation regex from the dict keys
    regex = re.compile("(%s)" % "|".join(map(re.escape, mapping.keys())))
    # Replace each match with its value from the mapping
    return regex.sub(lambda mo: mapping[mo.string[mo.start():mo.end()]], text)

treat_text_fun = treat_text(patterns)
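
# Illustrative check (not part of the original app; the sample string is made
# up): the cleaner strips section headers and parentheticals and rewrites '%'
# as ' percent'.
# >>> treat_text_fun("RESULTS: Accuracy improved (n=20) by 5%.")
# ' accuracy improved by 5 percent.'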
import sys

sys.path.append('ML-SLRC/')

path = 'ML-SLRC/'
model_path = path + 'model.pt'
info_path = path + 'Info.json'

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the trained model (map_location lets the checkpoint load on CPU-only machines)
model = torch.load(model_path, map_location=device)

# Load the meta information of the trained model
with open(info_path, 'r') as f:
    Info = json.load(f)
import random
from datetime import datetime

rand_seed = 2003

# Timestamp used to name this few-shot run
now = datetime.now()
time_stamp = now.strftime("%d_%m_%Y_HR_%H_%M_%S")

# Few-shot fine-tuning hyperparameters
config = {
    "shots_per_class": 8,
    "batch_size": 4,
    "epochs": 8,
    "learning_rate": 5e-05,
    "weight_decay": 0.85,
    "rand_seed": rand_seed,
    'pos_weight': 3.5,
    'p_incld': 0.2,
    'p_excld': 0.01,
}

NAME = str(config['shots_per_class']) + '-shots-Learner' + '_' + time_stamp

# Constants kept from the training setup; not all of them are used below
num_workers = 0
val_batch = 100
p_included = 0.7
p_notincluded = 0.3
sample_valid = 300

# Seed every RNG for reproducibility
gen_seed = torch.Generator().manual_seed(rand_seed)
np.random.seed(rand_seed)
torch.manual_seed(rand_seed)
random.seed(rand_seed)
def treat_data_input(data, etailment_txt):
    # Labeled rows (non-NaN 'test') are shuffled within each class for training
    data_train = data.groupby('test').sample(frac=1)

    dataload_all = data.copy()
    dataload_all.test = dataload_all.test.replace({np.nan: 'NANN'})

    # SLR_DataSet, initializer_model_scibert and LABEL_MAP are expected to
    # come from utils (imported with * at the top)
    dataset_train = SLR_DataSet(data=data_train,
                                input='text',
                                output='test',
                                tokenizer=initializer_model_scibert.tokenizer,
                                LABEL_MAP=LABEL_MAP,
                                treat_text=treat_text_fun,
                                etailment_txt=etailment_txt)
    dataset_remain = SLR_DataSet(data=dataload_all,
                                 input='text',
                                 output='test',
                                 tokenizer=initializer_model_scibert.tokenizer,
                                 LABEL_MAP=LABEL_MAP,
                                 treat_text=treat_text_fun,
                                 etailment_txt=etailment_txt)

    dataload_train = DataLoader(dataset_train,
                                batch_size=config['batch_size'], drop_last=False,
                                num_workers=num_workers)
    dataload_remain = DataLoader(dataset_remain,
                                 batch_size=200, drop_last=False,
                                 num_workers=num_workers)

    return dataload_train, dataload_remain, dataload_all
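
# The column names above imply an input spreadsheet with a free-text 'text'
# column and a 'test' column holding the few-shot labels, left empty (NaN)
# for the rows to be ranked. A hypothetical minimal file could be built as:
#
#   demo = pd.DataFrame({
#       'text': ['abstract one ...', 'abstract two ...', 'abstract three ...'],
#       'test': ['included', 'excluded', np.nan],
#   })
#   demo.to_excel('input.xlsx', index=False)
#
# The exact label strings must match the keys of LABEL_MAP from utils.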
import gc
from torch.optim import Adam

def treat_train_evaluate(dataload_train, dataload_remain):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Free any memory left over from a previous run
    gc.collect()
    torch.cuda.empty_cache()

    # Fine-tune a copy so the base few-shot learner stays untouched
    model_few = deepcopy(model)
    model_few.loss_fn = nn.BCEWithLogitsLoss(reduction='mean',
                                             pos_weight=torch.FloatTensor([config['pos_weight']]).to(device))

    optimizer = Adam(model_few.parameters(), lr=config['learning_rate'],
                     weight_decay=config['weight_decay'])

    model_few.to(device)
    model_few.train()

    trainlog = model_few.fit(optimizer=optimizer,
                             scheduler=None,
                             data_train_loader=dataload_train,
                             epochs=config['epochs'], print_info=1, metrics=False,
                             log=None, metrics_print=False)

    # Score every row of the spreadsheet with the fine-tuned model
    (loss, features_out, (logits, outputs)) = model_few.evaluate(dataload_remain)

    return logits
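
# Sanity-check sketch (assumption: evaluate() returns one logit per row, in
# the order of dataload_remain, which treat_sort below relies on):
#   logits = treat_train_evaluate(dataload_train, dataload_remain)
#   assert logits.squeeze().shape[0] == len(dataload_all)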
def treat_sort(dataload_all, logits):
    # Sigmoid turns logits into inclusion probabilities; move them off the
    # GPU and out of the autograd graph before handing them to pandas
    dataload_all['prediction'] = torch.sigmoid(logits).detach().cpu().numpy()
    dataload_all = dataload_all.sort_values(by=['prediction'], ascending=False).reset_index(drop=True)
    dataload_all.to_excel("output.xlsx")
def pipeline(data):
    # gradio may pass either a filepath string or a tempfile wrapper,
    # depending on the gradio version
    data = pd.read_excel(getattr(data, 'name', data))
    dataload_train, dataload_remain, dataload_all = treat_data_input(data, "its a great text")
    logits = treat_train_evaluate(dataload_train, dataload_remain)
    treat_sort(dataload_all, logits)
    return "output.xlsx"
import gradio as gr

# Minimal gradio UI: upload a spreadsheet, rank it, download the result
with gr.Blocks() as demo:
    fil = gr.File(label="input data")
    output = gr.File(label="output data")
    greet_btn = gr.Button("Rank")
    greet_btn.click(fn=pipeline, inputs=fil, outputs=output)

demo.launch()