param2004 committed on
Commit
690bcb6
·
verified ·
1 Parent(s): 1cc7aa6

Upload 17 files

app.py ADDED
@@ -0,0 +1,139 @@
+ import streamlit as st
+ from src.utils import load_model, load_data
+ from src.query_utils import QueryEnhancer
+ from src.search import hybrid_search, set_description_texts, set_patient_texts, strong_recall_indices
+ from src.ui import apply_custom_css, render_header, render_sidebar, render_chat_history, bot_typing_animation
+ from src.pdf_utils import build_chat_pdf
+ from src.llm import summarize_with_gemini
+ import numpy as np
+
+ def cosine_similarity(vec1, vec2):
+     return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
+
+ def filter_similar_answers(indices, doctor_embeddings, threshold=0.88):
+     """
+     Filters out semantically similar answers based on cosine similarity.
+
+     Args:
+         indices (list[int]): candidate indices of top answers (assumed sorted by relevance)
+         doctor_embeddings (np.array): embeddings of all doctor's answers (num_samples x dim)
+         threshold (float): similarity above which answers are considered duplicates
+
+     Returns:
+         filtered_indices (list[int]): indices of diverse answers
+     """
+     if len(indices) == 0:
+         return []
+
+     filtered = [indices[0]]  # Always keep the first (most relevant) one
+     for idx in indices[1:]:
+         emb = doctor_embeddings[idx]
+         keep = True
+         for f_idx in filtered:
+             existing_emb = doctor_embeddings[f_idx]
+             sim = np.dot(emb, existing_emb)  # Cosine sim (since normalized)
+             if sim >= threshold:
+                 keep = False
+                 break
+         if keep:
+             filtered.append(idx)
+     return filtered
+
+ # --- 1. Setup UI ---
+ apply_custom_css()
+ render_header()
+ render_sidebar()
+
+ # --- 2. Load Model & Data ---
+ model = load_model()
+ data = load_data()
+ google_api_key = st.secrets.get("GOOGLE_API_KEY")
+
+ if not data:
+     st.error("❌ Could not load dataset or embeddings. Please check your paths.")
+     st.stop()
+
+ query_enhancer = QueryEnhancer(model)
+
+ # --- 3. Set texts for recall ---
+ set_description_texts(data['description_column'])
+ set_patient_texts(data['patient_column'])
+
+ # --- 4. Initialize Chat History ---
+ if "messages" not in st.session_state:
+     st.session_state.messages = []
+
+ render_chat_history(st.session_state.messages)
+
+ # --- 5. PDF export sidebar ---
+ with st.sidebar:
+     st.markdown("---")
+     st.subheader("Export")
+     if st.session_state.get("messages"):
+         pdf_buffer = build_chat_pdf(
+             st.session_state.messages, title="MediLingua Chat Transcript"
+         )
+         if pdf_buffer:
+             st.download_button(
+                 label="📄 Download chat as PDF",
+                 data=pdf_buffer,
+                 file_name="medilingua_chat.pdf",
+                 mime="application/pdf",
+             )
+         else:
+             st.caption(
+                 "Install `reportlab` to enable PDF export: pip install reportlab"
+             )
+
+ # --- 6. Handle User Input ---
+ if prompt := st.chat_input("What is your medical question?"):
+     # Show user input immediately
+     st.session_state.messages.append({"role": "user", "content": prompt})
+     with st.chat_message("user"):
+         st.markdown(prompt)
+
+     # Placeholder for bot response
+     with st.chat_message("assistant"):
+         message_placeholder = st.empty()
+
+     # Step 1: Enhance query
+     enhanced_query = query_enhancer.enhance_query(prompt)
+
+     # Step 2: Strong recall + hybrid search
+     with st.spinner("🔍 Searching for top relevant answers..."):
+         # 1️⃣ Get top-k candidates
+         indices = strong_recall_indices(prompt, top_k=10)
+         if not indices or len(indices) < 3:
+             indices = hybrid_search(
+                 model,
+                 data['question_embeddings'],
+                 user_query_raw=prompt,
+                 user_query_enhanced=enhanced_query,
+                 top_k=10,
+                 weight_semantic=0.7,
+                 faiss_top_candidates=256,
+                 use_exact_match=True,
+                 use_fuzzy_match=True
+             )
+
+         # 2️⃣ Filter out semantically similar doctor answers to ensure diversity
+         indices = filter_similar_answers(indices, data['doctor_embeddings'], threshold=0.88)
+
+     # Step 3: Summarization / Gemini
+     if indices is not None and len(indices) > 0:
+         # Gather ALL filtered diverse answers for summarization (used as reference)
+         top_answers = [data['original_answers'][i] for i in indices]
+         combined_text = " ".join(top_answers)
+
+         summary = summarize_with_gemini(google_api_key, combined_text, prompt)
+
+         # Show only the AI's summarized answer (no doctor's notes displayed)
+         bot_typing_animation(message_placeholder, summary)
+
+         response = summary
+     else:
+         response = "⚕️ I couldn't find any contextually similar answer in the dataset."
+         bot_typing_animation(message_placeholder, response)
+
+     # Save conversation
+     st.session_state.messages.append({"role": "assistant", "content": response})
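A quick way to sanity-check `filter_similar_answers` outside Streamlit — a minimal sketch with hand-built unit-normalized vectors (the function's raw dot product assumes pre-normalized doctor embeddings, which is why the `cosine_similarity` helper above goes unused in the filter):

import numpy as np

# Three unit vectors: rows 0 and 1 are near-duplicates, row 2 is orthogonal.
doctor_embeddings = np.array([
    [1.0, 0.0],
    [0.999, 0.0447],   # cosine vs. row 0 ~ 0.999 -> dropped at threshold 0.88
    [0.0, 1.0],        # cosine vs. row 0 = 0.0   -> kept
], dtype="float32")
doctor_embeddings /= np.linalg.norm(doctor_embeddings, axis=1, keepdims=True)

# With filter_similar_answers from above pasted into the same session:
print(filter_similar_answers([0, 1, 2], doctor_embeddings, threshold=0.88))
# -> [0, 2]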
src/__pycache__/llm.cpython-313.pyc ADDED
Binary file (3.77 kB)

src/__pycache__/pdf_utils.cpython-313.pyc ADDED
Binary file (3.46 kB)

src/__pycache__/query_utils.cpython-313.pyc ADDED
Binary file (4.41 kB)

src/__pycache__/search.cpython-313.pyc ADDED
Binary file (11.9 kB)

src/__pycache__/ui.cpython-313.pyc ADDED
Binary file (5.37 kB)

src/__pycache__/utils.cpython-313.pyc ADDED
Binary file (4.97 kB)
src/llm.py ADDED
@@ -0,0 +1,52 @@
+ def summarize_with_gemini(api_key, doctor_answer, user_question, max_retries=2):
+     import requests, json, time, streamlit as st
+
+     if not api_key:
+         st.warning("⚠️ Google API Key missing. Showing full answer instead.")
+         return doctor_answer if isinstance(doctor_answer, str) else "\n\n".join(doctor_answer)
+
+     combined_answer = "\n\n---\n\n".join(doctor_answer) if isinstance(doctor_answer, list) else doctor_answer
+
+     candidate_models = ["gemini-2.0-flash", "gemini-2.5-flash", "gemini-2.0-pro"]
+
+     for model_name in candidate_models:
+         prompt = f"""You are a professional AI medical assistant.
+
+ Summarize the doctor's responses clearly, accurately, and concisely for the patient.
+ Focus only on medically relevant information that directly answers the user's question.
+
+ User's Question:
+ "{user_question}"
+
+ Doctor's Answer(s):
+ "{combined_answer}"
+
+ Instructions:
+ - Provide a medically correct, patient-friendly summary in simple, clear language.
+ - List multiple points as bullets if possible.
+ - If the user's question lacks personal details (e.g., gender, age, weight), generate a generalized, gender-neutral summary.
+ - Avoid gender-specific recommendations (e.g., consulting a gynecologist) unless the query explicitly mentions gender or related details.
+ - Don't forget to mention potential next steps, treatments, or lifestyle changes if the doctor's answer mentions them.
+ - Always include a recommendation to consult a relevant doctor type (e.g., general practitioner, orthopedist) at the end of the summary, unless the doctor's answers already specify a consultation with a specific doctor type.
+ - For example, for back pain, recommend consulting an orthopedist or general practitioner unless the query or doctor's answers suggest a more specific specialist.
+ - If the doctor's response does not address the question, respond:
+   "There is no information related to your question in the doctor's answer, so I generated the best possible answer based on the information provided." """
+
+         url = f"https://generativelanguage.googleapis.com/v1beta/models/{model_name}:generateContent?key={api_key}"
+         payload = {"contents": [{"parts": [{"text": prompt}]}]}
+         headers = {"Content-Type": "application/json"}
+
+         for attempt in range(max_retries):
+             try:
+                 resp = requests.post(url, headers=headers, data=json.dumps(payload), timeout=60)
+                 resp.raise_for_status()
+                 result = resp.json()
+                 if "candidates" in result and result["candidates"]:
+                     return result["candidates"][0]["content"]["parts"][0]["text"].strip()
+             except requests.exceptions.HTTPError:
+                 if resp.status_code == 404:
+                     break  # Model not served under this endpoint; try the next candidate
+                 time.sleep(1)
+             except Exception:
+                 time.sleep(1)
+
+     st.warning("⚕️ Could not generate summary. Showing original answer.")
+     return combined_answer
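For reference, the happy-path parsing above assumes the usual generateContent response shape. A standalone sketch with a mocked response dict (no network call, no API key) that exercises the same extraction:

# Mocked generateContent-style response, mirroring the keys the code above reads.
result = {
    "candidates": [
        {"content": {"parts": [{"text": "  Drink fluids and rest.  "}]}}
    ]
}
if "candidates" in result and result["candidates"]:
    summary = result["candidates"][0]["content"]["parts"][0]["text"].strip()
    print(summary)  # -> "Drink fluids and rest."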
src/pdf_utils.py ADDED
@@ -0,0 +1,87 @@
+ import io
+ from typing import List, Dict
+
+ import streamlit as st
+
+
+ def _import_reportlab():
+     try:
+         from reportlab.lib.pagesizes import A4
+         from reportlab.lib.styles import getSampleStyleSheet
+         from reportlab.lib.units import mm
+         from reportlab.pdfgen import canvas
+         from reportlab.platypus import Paragraph, SimpleDocTemplate, Spacer, Table, TableStyle
+         from reportlab.lib import colors
+         return {
+             "A4": A4,
+             "getSampleStyleSheet": getSampleStyleSheet,
+             "mm": mm,
+             "canvas": canvas,
+             "Paragraph": Paragraph,
+             "SimpleDocTemplate": SimpleDocTemplate,
+             "Spacer": Spacer,
+             "Table": Table,
+             "TableStyle": TableStyle,
+             "colors": colors,
+         }
+     except Exception:
+         return None
+
+
+ def build_chat_pdf(messages: List[Dict[str, str]], title: str = "MediLingua Chat Transcript") -> io.BytesIO:
+     """
+     Create a PDF from chat messages and return it as an in-memory BytesIO buffer.
+     Each message is a dict with keys: role ('user'|'assistant'), content (str).
+     """
+     libs = _import_reportlab()
+     if libs is None:
+         st.error(
+             "PDF generation library not found. Install with: `pip install reportlab` and rerun."
+         )
+         return None
+
+     buffer = io.BytesIO()
+
+     doc = libs["SimpleDocTemplate"](buffer, pagesize=libs["A4"], rightMargin=28, leftMargin=28, topMargin=36, bottomMargin=28)
+     styles = libs["getSampleStyleSheet"]()
+
+     elements = []
+
+     # Title
+     title_style = styles["Title"]
+     elements.append(libs["Paragraph"](title, title_style))
+     elements.append(libs["Spacer"](1, 12))
+
+     # Build a table-like layout for messages
+     data = []
+     table_style_cmds = [
+         ("VALIGN", (0, 0), (-1, -1), "TOP"),
+         ("INNERGRID", (0, 0), (-1, -1), 0.25, libs["colors"].lightgrey),
+         ("BOX", (0, 0), (-1, -1), 0.25, libs["colors"].lightgrey),
+         ("LEFTPADDING", (0, 0), (-1, -1), 6),
+         ("RIGHTPADDING", (0, 0), (-1, -1), 6),
+         ("TOPPADDING", (0, 0), (-1, -1), 6),
+         ("BOTTOMPADDING", (0, 0), (-1, -1), 6),
+     ]
+
+     role_style = styles["Heading5"]
+     body_style = styles["BodyText"]
+
+     for msg in messages:
+         role = msg.get("role", "").capitalize()
+         content = msg.get("content", "")
+
+         # Left column: role; right column: content paragraph
+         role_par = libs["Paragraph"](f"<b>{role}</b>", role_style)
+         content_par = libs["Paragraph"](content.replace("\n", "<br/>"), body_style)
+         data.append([role_par, content_par])
+
+     table = libs["Table"](data, colWidths=[30 * libs["mm"], None])
+     table.setStyle(libs["TableStyle"](table_style_cmds))
+     elements.append(table)
+
+     doc.build(elements)
+     buffer.seek(0)
+     return buffer
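A minimal usage sketch for `build_chat_pdf` (requires `pip install reportlab`; the message list and output file name are illustrative):

from src.pdf_utils import build_chat_pdf

messages = [
    {"role": "user", "content": "What helps with a sore throat?"},
    {"role": "assistant", "content": "Warm fluids, rest, and lozenges.\nSee a GP if it persists."},
]
buf = build_chat_pdf(messages, title="Demo Transcript")
if buf is not None:
    # The function returns an in-memory buffer, so it works with
    # st.download_button or a plain file write:
    with open("demo_transcript.pdf", "wb") as f:
        f.write(buf.read())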
src/query_utils.py ADDED
@@ -0,0 +1,86 @@
+ import re
+ import nltk
+ from nltk.tokenize import RegexpTokenizer
+ from nltk.corpus import stopwords
+ from nltk.stem import WordNetLemmatizer
+ from keybert import KeyBERT
+
+ # --- Download NLTK resources if needed ---
+ try:
+     stopwords.words('english')
+ except LookupError:
+     nltk.download('stopwords', quiet=True)
+     nltk.download('punkt', quiet=True)
+     nltk.download('averaged_perceptron_tagger', quiet=True)
+     nltk.download('wordnet', quiet=True)
+
+ # --- Initialize tools ---
+ tokenizer = RegexpTokenizer(r'\w+')
+ lemmatizer = WordNetLemmatizer()
+ custom_stopwords = set(stopwords.words('english')) - {'no', 'not', 'without', 'due', 'to', 'with', 'on', 'in'}
+
+ # --- Medical synonym expansion ---
+ medical_synonyms = {
+     "flu": ["influenza"],
+     "cold": ["common cold", "rhinitis"],
+     "heart attack": ["myocardial infarction"],
+     "diabetes": ["high blood sugar", "hyperglycemia"],
+     "bp": ["blood pressure", "hypertension"],
+     "hypertension": ["high blood pressure"],
+     "asthma": ["respiratory disease"],
+     "cough": ["dry cough", "wet cough"],
+     "fever": ["temperature", "high fever"]
+ }
+
+ def expand_medical_terms(text: str) -> str:
+     """Expands known medical terms with their synonyms for better recall."""
+     for key, syns in medical_synonyms.items():
+         for syn in syns:
+             text = re.sub(rf"\b{key}\b", f"{key} {syn}", text, flags=re.IGNORECASE)
+     return text
+
+ def preprocess_text(text: str) -> str:
+     """Minimal preprocessing: lowercase, remove punctuation, collapse spaces."""
+     text = str(text).lower()
+     text = re.sub(r'[^\w\s]', ' ', text)
+     text = re.sub(r'\s+', ' ', text).strip()
+     return text
+
+ class QueryEnhancer:
+     """
+     Wrapper class to handle query enhancement with local SapBERT + KeyBERT.
+     """
+     def __init__(self, sentence_transformer_model):
+         """
+         sentence_transformer_model: the already-loaded local SapBERT SentenceTransformer
+         """
+         self.kw_model = KeyBERT(sentence_transformer_model)
+
+     def extract_keywords(self, text: str, top_n: int = 5) -> list:
+         """Extracts top keywords using KeyBERT."""
+         if not self.kw_model:
+             return []
+         try:
+             keywords = self.kw_model.extract_keywords(
+                 text,
+                 keyphrase_ngram_range=(1, 2),
+                 stop_words='english',
+                 top_n=top_n
+             )
+             return [kw[0] for kw in keywords]
+         except Exception:
+             return []
+
+     def enhance_query(self, user_query: str) -> str:
+         """
+         Full query enhancement pipeline:
+         - Preprocess text
+         - Expand medical synonyms
+         - Extract keywords
+         - Return combined enhanced query string
+         """
+         preprocessed = preprocess_text(user_query)
+         expanded = expand_medical_terms(preprocessed)
+         keywords = self.extract_keywords(user_query)
+         enhanced_query = f"{expanded} {' '.join(keywords)}".strip()
+         return enhanced_query
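A small sketch of what the deterministic half of the pipeline produces (KeyBERT keywords depend on the embedding model, so only the preprocessing and synonym expansion are shown):

from src.query_utils import preprocess_text, expand_medical_terms

q = "What should I do about the flu?"
print(preprocess_text(q))
# -> "what should i do about the flu"
print(expand_medical_terms(preprocess_text(q)))
# -> "what should i do about the flu influenza"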
src/search.py ADDED
@@ -0,0 +1,271 @@
+ import numpy as np
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ import faiss
+ import re
+
+ # --- Global caches ---
+ tfidf_vectorizer = None
+ tfidf_matrix = None
+ corpus_texts = None
+ faiss_index = None
+ embeddings_array = None        # FAISS requires float32
+ description_texts = None       # For exact/fuzzy match
+ patient_texts = None           # For exact/fuzzy match
+ description_norm_texts = None  # Normalized (punctuation stripped)
+ patient_norm_texts = None      # Normalized (punctuation stripped)
+
+
+ def encode_question(model, user_question):
+     """Encodes the user's question using the embedding model."""
+     if model is None or not user_question.strip():
+         return None
+     return model.encode([user_question], show_progress_bar=False)[0].astype('float32')
+
+
+ def init_tfidf(data_texts):
+     """
+     Initialize the TF-IDF matrix for hybrid search.
+     """
+     global tfidf_vectorizer, tfidf_matrix, corpus_texts
+     corpus_texts = data_texts
+     tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
+     tfidf_matrix = tfidf_vectorizer.fit_transform(corpus_texts)
+
+
+ def init_faiss(embeddings):
+     """
+     Initialize the FAISS index for fast semantic search.
+     embeddings: np.array (num_samples x 768), already normalized
+     """
+     global faiss_index, embeddings_array
+     embeddings_array = embeddings.astype('float32')
+
+     # Do not renormalize
+     dimension = embeddings_array.shape[1]
+     faiss_index = faiss.IndexFlatIP(dimension)  # Inner product equals cosine similarity for unit vectors
+     faiss_index.add(embeddings_array)
+
+
+ def set_description_texts(texts):
+     """
+     Provide the Description column for exact/fuzzy match search.
+     """
+     global description_texts, description_norm_texts
+     description_texts = [str(t).lower() for t in texts]
+     description_norm_texts = [preprocess_text_for_embeddings(t) for t in texts]
+
+
+ def set_patient_texts(texts):
+     """
+     Provide the Patient column for exact/fuzzy match search.
+     """
+     global patient_texts, patient_norm_texts
+     patient_texts = [str(t).lower() for t in texts]
+     patient_norm_texts = [preprocess_text_for_embeddings(t) for t in texts]
+
+
+ def strong_recall_indices(user_query_raw: str, top_k: int = 10):
+     """
+     Scan the entire dataset (Description + Patient) for:
+     1) Exact equality on normalized text
+     2) Exact substring presence
+     3) High-threshold fuzzy match (if rapidfuzz is available)
+
+     Returns a list of unique indices, in priority order, up to top_k.
+     """
+     global description_texts, patient_texts, description_norm_texts, patient_norm_texts
+
+     if not user_query_raw:
+         return []
+
+     q_lower = str(user_query_raw).lower()
+     q_norm = preprocess_text_for_embeddings(user_query_raw)
+
+     N_desc = len(description_texts) if description_texts is not None else 0
+     N_pat = len(patient_texts) if patient_texts is not None else 0
+     N = max(N_desc, N_pat)
+     if N == 0:
+         return []
+
+     exact_equal = []
+     exact_sub = []
+
+     # 1) Exact equality on normalized text
+     if description_norm_texts is not None:
+         exact_equal += [i for i in range(len(description_norm_texts)) if description_norm_texts[i] == q_norm]
+     if patient_norm_texts is not None:
+         exact_equal += [i for i in range(len(patient_norm_texts)) if patient_norm_texts[i] == q_norm]
+
+     # Deduplicate, preserving order
+     seen = set()
+     ordered = []
+     for i in exact_equal:
+         if i not in seen:
+             seen.add(i)
+             ordered.append(i)
+             if len(ordered) >= top_k:
+                 return ordered[:top_k]
+
+     # 2) Exact substring presence (lowercased)
+     if description_texts is not None:
+         exact_sub += [i for i in range(len(description_texts)) if q_lower in description_texts[i]]
+     if patient_texts is not None:
+         exact_sub += [i for i in range(len(patient_texts)) if q_lower in patient_texts[i]]
+
+     for i in exact_sub:
+         if i not in seen:
+             seen.add(i)
+             ordered.append(i)
+             if len(ordered) >= top_k:
+                 return ordered[:top_k]
+
+     # 3) High-threshold fuzzy matches
+     try:
+         from rapidfuzz import fuzz
+         # Use partial_ratio and token_set_ratio; take the max as the score
+         scored = []
+         for i in range(N):
+             s_desc = description_texts[i] if (description_texts is not None and i < len(description_texts)) else ""
+             s_pat = patient_texts[i] if (patient_texts is not None and i < len(patient_texts)) else ""
+             score_desc = max(fuzz.partial_ratio(q_lower, s_desc), fuzz.token_set_ratio(q_lower, s_desc)) if s_desc else 0
+             score_pat = max(fuzz.partial_ratio(q_lower, s_pat), fuzz.token_set_ratio(q_lower, s_pat)) if s_pat else 0
+             score = max(score_desc, score_pat)
+             if score >= 90:
+                 scored.append((i, score))
+         # Sort by score, descending
+         scored.sort(key=lambda x: x[1], reverse=True)
+         for i, _ in scored:
+             if i not in seen:
+                 seen.add(i)
+                 ordered.append(i)
+                 if len(ordered) >= top_k:
+                     break
+     except Exception:
+         pass
+
+     return ordered[:top_k]
+
+
+ def hybrid_search(
+     model,
+     embeddings,
+     user_query_raw,
+     user_query_enhanced,
+     top_k=5,
+     weight_semantic=0.7,
+     faiss_top_candidates=256,
+     use_exact_match=True,
+     use_fuzzy_match=True
+ ):
+     """
+     Hybrid search combining:
+     1. FAISS semantic similarity
+     2. TF-IDF boosting
+     3. Optional exact/fuzzy matching against Description and Patient texts
+
+     Returns: list of top indices into the dataset
+     """
+     global tfidf_vectorizer, tfidf_matrix, corpus_texts, faiss_index, embeddings_array, description_texts
+
+     if model is None or embeddings is None or len(embeddings) == 0:
+         return []
+
+     # Encode the enhanced query for the semantic/TF-IDF stages
+     question_embedding = encode_question(model, user_query_enhanced)
+     if question_embedding is None:
+         return []
+
+     # --- 1. FAISS semantic search ---
+     if faiss_index is not None:
+         D, I = faiss_index.search(np.array([question_embedding]), k=min(faiss_top_candidates, embeddings.shape[0]))
+         top_candidates = I[0]
+         semantic_sim_top = D[0]
+     else:
+         semantic_sim_full = np.dot(embeddings, question_embedding)
+         top_candidates = np.argpartition(semantic_sim_full, -faiss_top_candidates)[-faiss_top_candidates:]
+         top_candidates = top_candidates[np.argsort(semantic_sim_full[top_candidates])[::-1]]
+         semantic_sim_top = semantic_sim_full[top_candidates]
+
+     # --- 2. TF-IDF similarity ---
+     if tfidf_vectorizer is not None and tfidf_matrix is not None:
+         tfidf_vec = tfidf_vectorizer.transform([user_query_enhanced])
+         tfidf_sim_top = (tfidf_matrix[top_candidates] @ tfidf_vec.T).toarray().ravel()
+     else:
+         tfidf_sim_top = np.zeros(len(top_candidates))
+
+     # --- 3. Optional exact + fuzzy match across Description & Patient ---
+     combined_sim_top = weight_semantic * semantic_sim_top + (1 - weight_semantic) * tfidf_sim_top
+
+     if use_exact_match or use_fuzzy_match:
+         query_lower = user_query_raw.lower()
+
+         # Exact substring presence boosts
+         exact_desc = np.zeros(len(top_candidates))
+         exact_pat = np.zeros(len(top_candidates))
+         if description_texts is not None:
+             exact_desc = np.array([1.0 if query_lower in description_texts[i] else 0.0 for i in top_candidates])
+         if patient_texts is not None:
+             exact_pat = np.array([1.0 if query_lower in patient_texts[i] else 0.0 for i in top_candidates])
+
+         # Fuzzy partial ratio via rapidfuzz (graceful fallback if unavailable)
+         fuzzy_desc = np.zeros(len(top_candidates))
+         fuzzy_pat = np.zeros(len(top_candidates))
+         if use_fuzzy_match:
+             try:
+                 from rapidfuzz import fuzz
+                 if description_texts is not None:
+                     fuzzy_desc = np.array([
+                         fuzz.partial_ratio(query_lower, description_texts[i]) / 100.0 for i in top_candidates
+                     ])
+                 if patient_texts is not None:
+                     fuzzy_pat = np.array([
+                         fuzz.partial_ratio(query_lower, patient_texts[i]) / 100.0 for i in top_candidates
+                     ])
+             except Exception:
+                 pass
+
+         # Token overlap (Jaccard) as an additional weak signal
+         def jaccard(a: str, b: str) -> float:
+             sa = set(a.split())
+             sb = set(b.split())
+             if not sa or not sb:
+                 return 0.0
+             inter = len(sa & sb)
+             union = len(sa | sb)
+             return inter / union if union else 0.0
+
+         token_desc = np.zeros(len(top_candidates))
+         token_pat = np.zeros(len(top_candidates))
+         if description_texts is not None:
+             token_desc = np.array([jaccard(query_lower, description_texts[i]) for i in top_candidates])
+         if patient_texts is not None:
+             token_pat = np.array([jaccard(query_lower, patient_texts[i]) for i in top_candidates])
+
+         # Combine boosters with gentle weights; exact match is strongest
+         booster = 0.20 * exact_desc + 0.20 * exact_pat + 0.10 * fuzzy_desc + 0.10 * fuzzy_pat + 0.05 * token_desc + 0.05 * token_pat
+         combined_sim_top = combined_sim_top + booster
+
+     # --- 4. Select final top-k indices ---
+     sorted_top_indices = top_candidates[np.argsort(combined_sim_top)[::-1][:top_k]]
+
+     # Return plain Python ints, matching the documented list return type
+     return sorted_top_indices.tolist()
+
+
+ # --- Minimal preprocessing for embeddings ---
+ def preprocess_text_for_embeddings(text: str) -> str:
+     """Lowercase + remove punctuation for embeddings."""
+     text = str(text).lower()
+     text = re.sub(r'[^\w\s]', ' ', text)
+     text = re.sub(r'\s+', ' ', text).strip()
+     return text
+
+
+ # --- Minimal preprocessing for keywords ---
+ def preprocess_text_for_keywords(text: str) -> str:
+     """Lowercase + remove punctuation for keywords."""
+     text = str(text).lower()
+     text = re.sub(r'[^\w\s]', ' ', text)
+     text = re.sub(r'\s+', ' ', text).strip()
+     return text
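A self-contained sketch of the retrieval pipeline, using a toy hashing "model" in place of SapBERT (an assumption purely for illustration; it only mimics the `encode` interface). Embeddings are unit-normalized so the inner-product index behaves as cosine similarity:

import numpy as np
from src.search import (init_tfidf, init_faiss, set_description_texts,
                        set_patient_texts, hybrid_search, strong_recall_indices)

class ToyModel:
    """Stand-in for the SentenceTransformer: hashes words into a fixed space."""
    def __init__(self, dim=64):
        self.dim = dim
    def encode(self, texts, show_progress_bar=False):
        out = []
        for t in texts:
            v = np.zeros(self.dim, dtype="float32")
            for w in t.lower().split():
                v[hash(w) % self.dim] += 1.0  # deterministic within one process
            n = np.linalg.norm(v)
            out.append(v / n if n else v)
        return np.array(out)

docs = ["persistent dry cough and fever",
        "lower back pain after lifting",
        "high blood pressure management"]
model = ToyModel()
emb = model.encode(docs)

init_tfidf(docs)
init_faiss(emb)
set_description_texts(docs)
set_patient_texts(docs)

print(strong_recall_indices("back pain", top_k=3))       # substring hit -> starts with [1, ...]
print(hybrid_search(model, emb, "back pain", "back pain lower back", top_k=2))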
src/ui.py ADDED
@@ -0,0 +1,124 @@
+ import streamlit as st
+ import time
+
+ def apply_custom_css():
+     """Applies custom CSS for proper left-right chat alignment with enhanced colors."""
+     css = """
+     <style>
+     .stApp {
+         background-color: #0f172a;
+         color: #e2e8f0;
+         font-family: 'Inter', sans-serif;
+     }
+     .block-container {
+         max-width: 900px;
+         margin: auto;
+         padding-top: 1rem;
+     }
+     /* Chat message layout override */
+     [data-testid="stChatMessage"] {
+         display: flex !important;
+         align-items: flex-start !important;
+         margin-bottom: 0.75rem;
+     }
+     [data-testid="stChatMessage"] > div[data-testid="stMarkdownContainer"] {
+         padding: 0.8rem 1rem;
+         border-radius: 16px;
+         max-width: 70%;
+         line-height: 1.5;
+         font-size: 0.95rem;
+         word-wrap: break-word;
+         box-shadow: 0 4px 12px rgba(0,0,0,0.25);
+         transition: all 0.2s ease-in-out;
+         animation: fadeIn 0.3s ease;
+     }
+     @keyframes fadeIn {
+         from { opacity: 0; transform: translateY(4px); }
+         to { opacity: 1; transform: translateY(0); }
+     }
+     /* Assistant (left) */
+     [data-testid="stChatMessage"]:has(.stChatMessageContent[data-testid="assistant"]) {
+         justify-content: flex-start !important;
+     }
+     [data-testid="stChatMessage"]:has(.stChatMessageContent[data-testid="assistant"])
+     > div[data-testid="stMarkdownContainer"] {
+         background-color: #1e293b;
+         color: #f1f5f9;
+         border: 1px solid #334155;
+         text-align: left;
+     }
+     /* User (right) */
+     [data-testid="stChatMessage"]:has(.stChatMessageContent[data-testid="user"]) {
+         justify-content: flex-end !important;
+     }
+     [data-testid="stChatMessage"]:has(.stChatMessageContent[data-testid="user"])
+     > div[data-testid="stMarkdownContainer"] {
+         background-color: #2563eb;
+         color: white;
+         border: 1px solid #1d4ed8;
+         text-align: right;
+     }
+     /* Expander (doctor notes) */
+     .streamlit-expanderHeader {
+         background: #111827;
+         color: #cbd5e1;
+         border: 1px solid #374151;
+         border-radius: 10px;
+     }
+     .streamlit-expanderContent {
+         background: #0b1220;
+         border-left: 2px solid #334155;
+     }
+     /* Scrollbar style */
+     ::-webkit-scrollbar { width: 8px; }
+     ::-webkit-scrollbar-thumb { background-color: #334155; border-radius: 10px; }
+     /* Header/title */
+     h1 {
+         text-align: center;
+         color: #60a5fa;
+         font-weight: 600;
+     }
+     p[style*='text-align: center;'] {
+         color: #94a3b8;
+     }
+     </style>
+     """
+     st.markdown(css, unsafe_allow_html=True)
+
+
+ def render_header():
+     st.title("🤖 MediLingua: Your Medical Assistant")
+     st.markdown(
+         "<p style='text-align: center; font-size: 1.1rem;'>Ask medical questions and get summarized answers from real doctor responses.</p>",
+         unsafe_allow_html=True
+     )
+
+
+ def render_sidebar():
+     with st.sidebar:
+         st.header("⚙️ Configuration")
+         if st.secrets.get("GOOGLE_API_KEY"):
+             st.success("✅ Google API Key configured.")
+         else:
+             st.error("❌ Missing API Key in `.streamlit/secrets.toml`.")
+         st.markdown("---")
+         st.markdown("💡 Built with **Streamlit** & **Gemini**.")
+
+
+ def render_chat_history(messages):
+     """Render previous messages."""
+     for message in messages:
+         with st.chat_message(message["role"]):
+             st.markdown(message["content"])
+
+
+ def bot_typing_animation(message_placeholder, final_text, delay=0.02):
+     """
+     Simulate a bot typing animation in chat.
+     """
+     message_placeholder.markdown("")  # Empty initially
+     displayed = ""
+     for char in final_text:
+         displayed += char
+         message_placeholder.markdown(displayed)
+         time.sleep(delay)
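A minimal demo of the typing animation on its own (save as, say, demo_ui.py — the file name is illustrative — and launch with `streamlit run demo_ui.py`):

import streamlit as st
from src.ui import apply_custom_css, render_header, bot_typing_animation

apply_custom_css()
render_header()
with st.chat_message("assistant"):
    placeholder = st.empty()  # bot_typing_animation rewrites this placeholder char by char
    bot_typing_animation(placeholder, "Hello! Ask me a medical question.", delay=0.01)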
src/utils.py ADDED
@@ -0,0 +1,94 @@
+ import os
+ import streamlit as st
+ import pickle
+ import pandas as pd
+ from sentence_transformers import SentenceTransformer, models
+ import torch
+ import numpy as np
+ from src.search import init_faiss
+ from huggingface_hub import hf_hub_download, snapshot_download
+
+ # Repo IDs
+ DATASET_REPO = "param2004/Medilingua-dataset"
+ MODEL_REPO = "param2004/Medilingua-model"
+
+ @st.cache_resource
+ def load_model():
+     """Load SapBERT dynamically from the Hugging Face Hub."""
+     device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+     st.info(f"🔬 Loading SapBERT from Hugging Face Hub on {device.upper()}...")
+
+     # Download the model folder dynamically. models.Transformer expects a
+     # directory with config.json, tokenizer files, and weights, so fetch the
+     # whole folder rather than a single pytorch_model.bin file.
+     try:
+         repo_path = snapshot_download(
+             repo_id=MODEL_REPO,
+             allow_patterns=["models/SapBERT-from-PubMedBERT-fulltext/*"]
+         )
+         model_path = os.path.join(repo_path, "models", "SapBERT-from-PubMedBERT-fulltext")
+
+         # Build the SentenceTransformer: transformer encoder + mean pooling
+         word_embedding_model = models.Transformer(model_path)
+         pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
+         model = SentenceTransformer(modules=[word_embedding_model, pooling_model], device=device)
+
+         st.success("✅ SapBERT loaded successfully from Hub.")
+     except Exception as e:
+         st.error(f"❌ Failed to load SapBERT from Hub. Details: {e}")
+         st.warning("⚠️ Falling back to 'all-MiniLM-L6-v2' model.")
+         model = SentenceTransformer('all-MiniLM-L6-v2', device=device)
+
+     return model
+
+
+ @st.cache_resource
+ def load_data():
+     """Load embeddings and dataset dynamically from the Hugging Face Hub."""
+     try:
+         # Download embeddings
+         question_emb_path = hf_hub_download(
+             repo_id=DATASET_REPO,
+             filename="dataset/question_embeddings.pkl"
+         )
+         doctor_emb_path = hf_hub_download(
+             repo_id=DATASET_REPO,
+             filename="dataset/doctor_embeddings.pkl"
+         )
+         dataset_csv_path = hf_hub_download(
+             repo_id=DATASET_REPO,
+             filename="dataset/dataset.csv"
+         )
+
+         # Load embeddings
+         with open(question_emb_path, 'rb') as f:
+             question_data = pickle.load(f)
+             question_embeddings = question_data.get('embeddings').astype('float32')
+
+         with open(doctor_emb_path, 'rb') as f:
+             doctor_data = pickle.load(f)
+             doctor_embeddings = doctor_data.get('embeddings').astype('float32')
+
+         # Load the CSV and keep rows aligned with the embeddings
+         df = pd.read_csv(dataset_csv_path)
+         df.dropna(subset=['Description', 'Patient', 'Doctor'], inplace=True)
+         df.drop_duplicates(inplace=True)
+
+         num_samples = min(len(df), len(question_embeddings), len(doctor_embeddings))
+         df = df.iloc[:num_samples]
+         question_embeddings = question_embeddings[:num_samples]
+         doctor_embeddings = doctor_embeddings[:num_samples]
+
+         st.success(f"✅ Loaded {num_samples} rows with SapBERT embeddings ({question_embeddings.shape[1]} dims)")
+
+         # Initialize FAISS
+         init_faiss(question_embeddings)
+
+         return {
+             "question_embeddings": question_embeddings,
+             "doctor_embeddings": doctor_embeddings,
+             "description_column": df["Description"].tolist(),
+             "patient_column": df["Patient"].tolist(),
+             "original_answers": df["Doctor"].tolist(),
+         }
+
+     except Exception as e:
+         st.error(f"❌ Error loading dataset or embeddings: {e}")
+         return None
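For anyone regenerating the dataset artifacts: load_data() reads each pickle as a dict with an "embeddings" key holding a (num_samples x dim) float array — inferred from the .get('embeddings') calls above; the exact layout of the hosted files is an assumption. Rows should be unit-normalized, since init_faiss uses an inner-product index without renormalizing. A sketch:

import pickle
import numpy as np

emb = np.random.rand(100, 768).astype("float32")
emb /= np.linalg.norm(emb, axis=1, keepdims=True)  # unit-normalize so IndexFlatIP acts as cosine
with open("question_embeddings.pkl", "wb") as f:
    pickle.dump({"embeddings": emb}, f)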