# SmartManuals-AI App for Hugging Face Spaces
# Full app.py with spaCy-based sentence segmentation and model dropdown selection

import io
import os

import fitz  # PyMuPDF
import chromadb
import torch
import docx  # python-docx
import gradio as gr
import pytesseract
import spacy
from PIL import Image
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
# ---------------------------
# ⚙️ Configuration
# ---------------------------
MANUALS_DIR = "./Manuals"
CHROMA_PATH = "./chroma_store"
CHROMA_COLLECTION = "manual_chunks"
CHUNK_SIZE = 750       # target chunk size in whitespace-separated tokens
CHUNK_OVERLAP = 100    # tokens of trailing context carried into the next chunk
EMBED_MODEL = "all-MiniLM-L6-v2"
DEFAULT_MODEL = "meta-llama/Meta-Llama-3-8B-Instruct"
AVAILABLE_MODELS = [
    "meta-llama/Meta-Llama-3-8B-Instruct",
    "meta-llama/Llama-4-Scout-17B-16E-Instruct",
    "google/gemma-1.1-7b-it",
    "mistralai/Mistral-7B-Instruct-v0.3",
    "Qwen/Qwen1.5-7B-Chat",
]
HF_TOKEN = os.environ.get("HF_TOKEN")
# ---------------------------
# 📚 Load NLP model for sentence splitting
# ---------------------------
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # model not bundled in the image: download it once at startup
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")
def split_sentences(text):
    return [sent.text.strip() for sent in nlp(text).sents if sent.text.strip()]
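# Illustrative only (exact boundaries depend on the spaCy model):
# split_sentences("Unplug the unit. Wait 30 seconds before restarting.")
#   -> ["Unplug the unit.", "Wait 30 seconds before restarting."]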
# ---------------------------
# 🧹 Text cleanup
# ---------------------------
def clean(text):
    return "\n".join(line.strip() for line in text.splitlines() if line.strip())
# ---------------------------
# 📄 PDF and DOCX extractors
# ---------------------------
def extract_pdf_text(path):
    doc = fitz.open(path)
    pages = []
    for i, page in enumerate(doc):
        text = page.get_text()
        if not text.strip():
            # no text layer (scanned page): render at 300 dpi and OCR it
            pix = page.get_pixmap(dpi=300)
            img = Image.open(io.BytesIO(pix.tobytes("png")))
            text = pytesseract.image_to_string(img)
        pages.append((i + 1, text))
    doc.close()
    return pages
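# Note: the OCR fallback needs the Tesseract binary on the system path
# (on Spaces, add `tesseract-ocr` to packages.txt); pytesseract is only a wrapper.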
def extract_docx_text(path):
    doc = docx.Document(path)
    full_text = "\n".join(para.text for para in doc.paragraphs if para.text.strip())
    # DOCX has no page concept, so treat the whole document as page 1
    return [(1, full_text)]
# ---------------------------
# 📦 Chunk splitter
# ---------------------------
def chunkify(sentences, max_tokens=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
    """Greedily pack sentences into ~max_tokens-word chunks, carrying
    roughly `overlap` words of trailing context into the next chunk."""
    chunks, current, length = [], [], 0
    for sent in sentences:
        tokens = len(sent.split())
        if current and length + tokens > max_tokens:
            chunks.append(" ".join(current))
            # keep trailing sentences totalling at most `overlap` words
            kept, kept_len = [], 0
            for prev in reversed(current):
                n = len(prev.split())
                if kept_len + n > overlap:
                    break
                kept.insert(0, prev)
                kept_len += n
            current, length = kept, kept_len
        current.append(sent)
        length += tokens
    if current:
        chunks.append(" ".join(current))
    return chunks
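# Illustrative only: with 3-word sentences s1..s3, max_tokens=7, overlap=3,
# chunkify yields ["s1 s2", "s2 s3"]; s2 is repeated as overlap context.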
# ---------------------------
# 📁 Metadata from file
# ---------------------------
def extract_meta(name):
    name = name.lower()
    return {
        "model": next((m for m in ["se3", "se4", "symbio", "explore"] if m in name), "unknown"),
        "doc_type": next((d for d in ["owner", "service", "parts"] if d in name), "unknown"),
        "brand": "life fitness",
    }
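# e.g. (hypothetical filename):
# extract_meta("SE3_Service_Manual.pdf")
#   -> {"model": "se3", "doc_type": "service", "brand": "life fitness"}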
# ---------------------------
# 🔍 Embed and store chunks
# ---------------------------
def embed_all():
    embedder = SentenceTransformer(EMBED_MODEL)
    client = chromadb.PersistentClient(path=CHROMA_PATH)
    try:
        client.delete_collection(CHROMA_COLLECTION)
    except Exception:
        pass  # collection may not exist yet
    db = client.create_collection(CHROMA_COLLECTION)
    os.makedirs(MANUALS_DIR, exist_ok=True)
    for fname in os.listdir(MANUALS_DIR):
        path = os.path.join(MANUALS_DIR, fname)
        if fname.lower().endswith(".pdf"):
            pages = extract_pdf_text(path)
        elif fname.lower().endswith(".docx"):
            pages = extract_docx_text(path)
        else:
            continue
        meta = extract_meta(fname)
        for page, text in pages:
            sents = split_sentences(clean(text))
            chunks = chunkify(sents)
            for i, chunk in enumerate(chunks):
                db.add(
                    ids=[f"{fname}::p{page}::c{i}"],
                    documents=[chunk],
                    embeddings=[embedder.encode(chunk).tolist()],
                    metadatas=[{**meta, "source": fname, "page": page}],
                )
    return db, embedder
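# Since vectors are supplied explicitly at add time, queries must pass
# query_embeddings computed with the same SentenceTransformer (see answer_query
# below); mixing in Chroma's default embedding function would skew distances.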
# ---------------------------
# 🤖 Load selected LLM model
# ---------------------------
_PIPELINES = {}  # cache so switching questions doesn't reload multi-GB weights

def load_model(repo):
    if repo not in _PIPELINES:
        tokenizer = AutoTokenizer.from_pretrained(repo, token=HF_TOKEN)
        model = AutoModelForCausalLM.from_pretrained(
            repo,
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            device_map="auto" if torch.cuda.is_available() else None,
            token=HF_TOKEN,
        )
        # when device_map="auto" places the model, pipeline must not also get a device
        _PIPELINES[repo] = pipeline("text-generation", model=model, tokenizer=tokenizer)
    return _PIPELINES[repo]
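# Note: the meta-llama/* and google/gemma repos are gated on the Hub, so HF_TOKEN
# must belong to an account that has accepted each model's license.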
# ---------------------------
# 🔥 Retrieval-Augmented QA
# ---------------------------
def answer_query(q, model_choice):
    results = db.query(
        query_embeddings=[embedder.encode(q).tolist()],
        n_results=3,
    )
    context = "\n\n".join(results["documents"][0])
    prompt = f"""
You are a helpful assistant. Answer based on the context. If unsure, say "I don't know".

Context:
{context}

Question: {q}

Answer:
"""
    pipe = load_model(model_choice)
    out = pipe(prompt, max_new_tokens=300, do_sample=False)[0]["generated_text"]
    return out.split("Answer:")[-1].strip()
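# e.g. answer_query("How do I reset the SE3 console?", DEFAULT_MODEL)
# returns only the text the model generates after the final "Answer:" marker.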
# ---------------------------
# 🚀 Initialize app
# ---------------------------
print("Embedding documents...")
db, embedder = embed_all()
print("Done embedding.")
# ---------------------------
# 🎛️ Gradio UI
# ---------------------------
with gr.Blocks() as demo:
    gr.Markdown("""# 🧠 SmartManuals-AI
Ask any question and let the model answer from your uploaded manuals.
""")
    with gr.Row():
        qbox = gr.Textbox(label="Ask a Question", placeholder="e.g. How to reset the SE3 console?")
        model_select = gr.Dropdown(choices=AVAILABLE_MODELS, label="Choose LLM", value=DEFAULT_MODEL)
    ansbox = gr.Textbox(label="Answer", lines=10)
    btn = gr.Button("🔍 Submit")
    btn.click(fn=answer_query, inputs=[qbox, model_select], outputs=ansbox)

demo.launch()