import re

import emoji
import demoji
import gradio as gr
import joblib
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from hazm import Normalizer, Lemmatizer as HazmLemmatizer, word_tokenize as hazm_tokenize

# Download the NLTK resources needed for tokenization, stopword removal, and lemmatization
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("stopwords")
nltk.download("wordnet")

# ==========================================================
# 📦 Load all models
# ==========================================================
vectorizer_en = joblib.load("tfidf_vectorizer_en.pkl")
le_en = joblib.load("label_encoder_en.pkl")
stacking_en = joblib.load("stacking_en.pkl")

vectorizer_fa = joblib.load("tfidf_vectorizer_fa.pkl")
le_fa = joblib.load("label_encoder_fa.pkl")
stacking_fa = joblib.load("stacking_fa.pkl")

# ==========================================================
# 🧹 Text cleaning functions
# ==========================================================

# English preprocessing
lemmatizer = WordNetLemmatizer()
STOPWORDS = set(stopwords.words("english"))

RE_URL = re.compile(r"http\S+|www\.\S+")
RE_HTML = re.compile(r"<.*?>")
RE_NONALPHA = re.compile(r"[^a-zA-Z\s]")


def preprocess_english(text):
    """Lowercase, strip emojis/URLs/HTML/non-alphabetic characters, remove stopwords, and lemmatize."""
    text = str(text).lower()
    text = emoji.demojize(text)
    text = demoji.replace(text, "")
    text = RE_URL.sub(" ", text)
    text = RE_HTML.sub(" ", text)
    text = RE_NONALPHA.sub(" ", text)
    text = re.sub(r"\s+", " ", text).strip()
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(t) for t in tokens if t not in STOPWORDS and len(t) > 2]
    return " ".join(tokens)


# Persian preprocessing
normalizer = Normalizer()
hazm_lemmatizer = HazmLemmatizer()

RE_URL_FA = re.compile(r"http\S+|www\.\S+")
RE_NONPERSIAN = re.compile(r"[^\u0600-\u06FFA-Za-z\s]")


def preprocess_persian(text):
    """Normalize with hazm, strip emojis/URLs/mentions/hashtags/digits, keep Persian and Latin letters, and lemmatize."""
    text = str(text)
    text = normalizer.normalize(text)
    text = emoji.demojize(text)
    text = demoji.replace(text, "")
    text = RE_URL_FA.sub(" ", text)
    text = re.sub(r"@\w+|#\w+|\d+", " ", text)
    text = RE_NONPERSIAN.sub(" ", text)
    text = re.sub(r"\s+", " ", text).strip()
    tokens = hazm_tokenize(text)
    tokens = [hazm_lemmatizer.lemmatize(t) for t in tokens if len(t) > 1]
    return " ".join(tokens)


# ==========================================================
# 🔮 Prediction function
# ==========================================================
def predict_sentiment(comment, language):
    """Clean the comment, vectorize it, and return the predicted label with per-class probabilities as Markdown."""
    if language == "English":
        clean_text = preprocess_english(comment)
        X = vectorizer_en.transform([clean_text])
        pred = stacking_en.predict(X)[0]
        probs = stacking_en.predict_proba(X)[0]
        classes = le_en.classes_
    else:
        clean_text = preprocess_persian(comment)
        X = vectorizer_fa.transform([clean_text])
        pred = stacking_fa.predict(X)[0]
        probs = stacking_fa.predict_proba(X)[0]
        classes = le_fa.classes_

    result_str = f"🔹 **Predicted Sentiment:** {pred}\n\n"
    prob_table = "\n".join([f"{cls}: {round(p, 3)}" for cls, p in zip(classes, probs)])
    return f"🗣️ **Input:** {comment}\n\n{result_str}**Prediction Probabilities:**\n{prob_table}"


# ==========================================================
# 🎨 Gradio UI
# ==========================================================
lang_dropdown = gr.Dropdown(["English", "Persian"], label="Select Language", value="English")
input_box = gr.Textbox(label="Enter your comment here")
output_box = gr.Markdown()

iface = gr.Interface(
    fn=predict_sentiment,
    inputs=[input_box, lang_dropdown],
    outputs=output_box,
    title="🌍 Multilingual Sentiment Analyzer (English + Persian)",
    description="Enter a comment in English or Persian to see the predicted sentiment and probabilities.",
    examples=[
        ["I loved the show! It was amazing!", "English"],
        ["برنامه خیلی عالی بود و مجری هم خوب بود", "Persian"],
        ["It was an average episode, not too bad.", "English"],
    ],
)

if __name__ == "__main__":
    iface.launch()