# SmartManuals-AI App for Hugging Face Spaces
# Full app.py with spaCy-based sentence segmentation and model dropdown selection

import io
import os

import fitz  # PyMuPDF
import chromadb
import torch
import docx  # python-docx
import gradio as gr
import pytesseract
import spacy
from PIL import Image
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
# ---------------------------
# ⚙️ Configuration
# ---------------------------
MANUALS_DIR = "./Manuals"
CHROMA_PATH = "./chroma_store"
CHROMA_COLLECTION = "manual_chunks"
CHUNK_SIZE = 750       # target chunk size in whitespace-separated tokens
CHUNK_OVERLAP = 100    # tokens of trailing context carried into the next chunk
EMBED_MODEL = "all-MiniLM-L6-v2"
DEFAULT_MODEL = "meta-llama/Meta-Llama-3-8B-Instruct"
AVAILABLE_MODELS = [
    "meta-llama/Meta-Llama-3-8B-Instruct",
    "meta-llama/Llama-4-Scout-17B-16E-Instruct",
    "google/gemma-1.1-7b-it",
    "mistralai/Mistral-7B-Instruct-v0.3",
    "Qwen/Qwen1.5-7B-Chat",
]
HF_TOKEN = os.environ.get("HF_TOKEN")
# ---------------------------
# 📚 Load NLP model for sentence splitting
# ---------------------------
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # model not bundled in the image: download it once at startup
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")
def split_sentences(text):
    return [sent.text.strip() for sent in nlp(text).sents if sent.text.strip()]
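# Illustrative only (exact boundaries depend on the spaCy model):
# split_sentences("Unplug the unit. Wait 30 seconds before restarting.")
#   -> ["Unplug the unit.", "Wait 30 seconds before restarting."]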
# ---------------------------
# 🧹 Text cleanup
# ---------------------------
def clean(text):
    return "\n".join(line.strip() for line in text.splitlines() if line.strip())
# ---------------------------
# 📄 PDF and DOCX extractors
# ---------------------------
def extract_pdf_text(path):
    doc = fitz.open(path)
    pages = []
    for i, page in enumerate(doc):
        text = page.get_text()
        if not text.strip():
            # no text layer (scanned page): render at 300 dpi and OCR it
            pix = page.get_pixmap(dpi=300)
            img = Image.open(io.BytesIO(pix.tobytes("png")))
            text = pytesseract.image_to_string(img)
        pages.append((i + 1, text))
    doc.close()
    return pages
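# Note: the OCR fallback needs the Tesseract binary on the system path
# (on Spaces, add `tesseract-ocr` to packages.txt); pytesseract is only a wrapper.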
def extract_docx_text(path):
    doc = docx.Document(path)
    full_text = "\n".join(para.text for para in doc.paragraphs if para.text.strip())
    # DOCX has no page concept, so treat the whole document as page 1
    return [(1, full_text)]
# ---------------------------
# 📦 Chunk splitter
# ---------------------------
def chunkify(sentences, max_tokens=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
    """Greedily pack sentences into ~max_tokens-word chunks, carrying
    roughly `overlap` words of trailing context into the next chunk."""
    chunks, current, length = [], [], 0
    for sent in sentences:
        tokens = len(sent.split())
        if current and length + tokens > max_tokens:
            chunks.append(" ".join(current))
            # keep trailing sentences totalling at most `overlap` words
            kept, kept_len = [], 0
            for prev in reversed(current):
                n = len(prev.split())
                if kept_len + n > overlap:
                    break
                kept.insert(0, prev)
                kept_len += n
            current, length = kept, kept_len
        current.append(sent)
        length += tokens
    if current:
        chunks.append(" ".join(current))
    return chunks
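# Illustrative only: with 3-word sentences s1..s3, max_tokens=7, overlap=3,
# chunkify yields ["s1 s2", "s2 s3"]; s2 is repeated as overlap context.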
# ---------------------------
# 📁 Metadata from file
# ---------------------------
def extract_meta(name):
    name = name.lower()
    return {
        "model": next((m for m in ["se3", "se4", "symbio", "explore"] if m in name), "unknown"),
        "doc_type": next((d for d in ["owner", "service", "parts"] if d in name), "unknown"),
        "brand": "life fitness",
    }
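# e.g. (hypothetical filename):
# extract_meta("SE3_Service_Manual.pdf")
#   -> {"model": "se3", "doc_type": "service", "brand": "life fitness"}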
# ---------------------------
# 🔍 Embed and store chunks
# ---------------------------
def embed_all():
    embedder = SentenceTransformer(EMBED_MODEL)
    client = chromadb.PersistentClient(path=CHROMA_PATH)
    try:
        client.delete_collection(CHROMA_COLLECTION)
    except Exception:
        pass  # collection may not exist yet
    db = client.create_collection(CHROMA_COLLECTION)
    os.makedirs(MANUALS_DIR, exist_ok=True)
    for fname in os.listdir(MANUALS_DIR):
        path = os.path.join(MANUALS_DIR, fname)
        if fname.lower().endswith(".pdf"):
            pages = extract_pdf_text(path)
        elif fname.lower().endswith(".docx"):
            pages = extract_docx_text(path)
        else:
            continue
        meta = extract_meta(fname)
        for page, text in pages:
            sents = split_sentences(clean(text))
            chunks = chunkify(sents)
            for i, chunk in enumerate(chunks):
                db.add(
                    ids=[f"{fname}::p{page}::c{i}"],
                    documents=[chunk],
                    embeddings=[embedder.encode(chunk).tolist()],
                    metadatas=[{**meta, "source": fname, "page": page}],
                )
    return db, embedder
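# Since vectors are supplied explicitly at add time, queries must pass
# query_embeddings computed with the same SentenceTransformer (see answer_query
# below); mixing in Chroma's default embedding function would skew distances.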
# ---------------------------
# 🤖 Load selected LLM model
# ---------------------------
_PIPELINES = {}  # cache so switching questions doesn't reload multi-GB weights

def load_model(repo):
    if repo not in _PIPELINES:
        tokenizer = AutoTokenizer.from_pretrained(repo, token=HF_TOKEN)
        model = AutoModelForCausalLM.from_pretrained(
            repo,
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            device_map="auto" if torch.cuda.is_available() else None,
            token=HF_TOKEN,
        )
        # when device_map="auto" places the model, pipeline must not also get a device
        _PIPELINES[repo] = pipeline("text-generation", model=model, tokenizer=tokenizer)
    return _PIPELINES[repo]
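# Note: the meta-llama/* and google/gemma repos are gated on the Hub, so HF_TOKEN
# must belong to an account that has accepted each model's license.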
# ---------------------------
# 🔥 Retrieval-Augmented QA
# ---------------------------
def answer_query(q, model_choice):
    results = db.query(
        query_embeddings=[embedder.encode(q).tolist()],
        n_results=3,
    )
    context = "\n\n".join(results["documents"][0])
    prompt = f"""
You are a helpful assistant. Answer based on the context. If unsure, say "I don't know".

Context:
{context}

Question: {q}

Answer:
"""
    pipe = load_model(model_choice)
    out = pipe(prompt, max_new_tokens=300, do_sample=False)[0]["generated_text"]
    return out.split("Answer:")[-1].strip()
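# e.g. answer_query("How do I reset the SE3 console?", DEFAULT_MODEL)
# returns only the text the model generates after the final "Answer:" marker.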
# ---------------------------
# 🚀 Initialize app
# ---------------------------
print("Embedding documents...")
db, embedder = embed_all()
print("Done embedding.")
# ---------------------------
# 🎛️ Gradio UI
# ---------------------------
with gr.Blocks() as demo:
    gr.Markdown("""# 🧠 SmartManuals-AI
Ask any question and let the model answer from your uploaded manuals.
""")
    with gr.Row():
        qbox = gr.Textbox(label="Ask a Question", placeholder="e.g. How to reset the SE3 console?")
        model_select = gr.Dropdown(choices=AVAILABLE_MODELS, label="Choose LLM", value=DEFAULT_MODEL)
    ansbox = gr.Textbox(label="Answer", lines=10)
    btn = gr.Button("🔍 Submit")
    btn.click(fn=answer_query, inputs=[qbox, model_select], outputs=ansbox)

demo.launch()