Spaces:

zamal
/

Multimodal-Chat-Playground

Sleeping

App Files Community

zamal committed on May 30

Commit

cd8c42c

verified ·

1 Parent(s): 5580220

Update app.py

Browse files

Files changed (1) hide show

app.py +44 -22

app.py CHANGED Viewed

@@ -37,7 +37,8 @@ from utils import *
 # Load .env
 load_dotenv()
 HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
 # OCR + multimodal image description setup
 ocr_model = ocr_predictor(
     "db_resnet50", "crnn_mobilenet_v3_large", pretrained=True, assume_straight_pages=True
@@ -52,9 +53,20 @@ vision_model = LlavaNextForConditionalGeneration.from_pretrained(
 @spaces.GPU()
 def get_image_description(image: Image.Image) -> str:
-    """Generate a one-sentence description via LlavaNext."""
     torch.cuda.empty_cache()
     gc.collect()
     prompt = "[INST] <image>\nDescribe the image in a sentence [/INST]"
     inputs = processor(prompt, image, return_tensors="pt").to("cuda")
     output = vision_model.generate(**inputs, max_new_tokens=100)
@@ -143,42 +155,45 @@ OCR_CHOICES = {
     "db_resnet50 + crnn_resnet31":          ("db_resnet50", "crnn_resnet31"),
 }
 def extract_data_from_pdfs(
-    docs,
-    session,
-    include_images,    # "Include Images" or "Exclude Images"
-    do_ocr,            # "Get Text With OCR" or "Get Available Text Only"
-    ocr_choice,        # key into OCR_CHOICES
-    vlm_choice,        # HF repo ID for LlavaNext
     progress=gr.Progress()
 ):
     """
     1) Dynamically instantiate the chosen OCR pipeline (if any)
     2) Dynamically instantiate the chosen vision‐language model
-    3) Override the global get_image_description to use that model for captions
     4) Extract text & images, index into ChromaDB
     """
     if not docs:
         raise gr.Error("No documents to process")
-    # ——— 1) Set up OCR if requested ————————————————
     if do_ocr == "Get Text With OCR":
         db_m, crnn_m = OCR_CHOICES[ocr_choice]
         local_ocr = ocr_predictor(db_m, crnn_m, pretrained=True, assume_straight_pages=True)
     else:
         local_ocr = None
-    # ——— 2) Set up vision‐language model —————————————
     proc = LlavaNextProcessor.from_pretrained(vlm_choice)
-    vis  = LlavaNextForConditionalGeneration.from_pretrained(
-        vlm_choice,
-        torch_dtype=torch.float16,
-        low_cpu_mem_usage=True
-    ).to("cuda")
-    # ——— 3) Monkey‐patch global get_image_description ————
     def describe(img: Image.Image) -> str:
-        torch.cuda.empty_cache(); gc.collect()
         prompt = "[INST] <image>\nDescribe the image in a sentence [/INST]"
         inputs = proc(prompt, img, return_tensors="pt").to("cuda")
         output = vis.generate(**inputs, max_new_tokens=100)
@@ -187,29 +202,35 @@ def extract_data_from_pdfs(
     global get_image_description
     get_image_description = describe
-    # ——— 4) Extract text & images ————————————————
     progress(0.2, "Extracting text and images…")
-    all_text, images, names = "", [], []
     for path in docs:
         if local_ocr:
             pdf = DocumentFile.from_pdf(path)
             res = local_ocr(pdf)
             all_text += result_to_text(res, as_text=True) + "\n\n"
         else:
             txt = PdfReader(path).pages[0].extract_text() or ""
-            all_text += "\n\n" + txt + "\n\n"
         if include_images == "Include Images":
             imgs = extract_images([path])
             images.extend(imgs)
             names.extend([os.path.basename(path)] * len(imgs))
-    # ——— 5) Index into vector DB ————————————————
     progress(0.6, "Indexing in vector DB…")
     vdb = get_vectordb(all_text, images, names)
     session["processed"] = True
     sample_imgs = images[:4] if include_images == "Include Images" else []
     return (
         vdb,
         session,
@@ -218,6 +239,7 @@ def extract_data_from_pdfs(
         sample_imgs,
         "<h3>Done!</h3>"
     )
 # Chat function
 def conversation(
     vdb, question: str, num_ctx, img_ctx,

 # Load .env
 load_dotenv()
 HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
+processor = None
+vision_model = None
 # OCR + multimodal image description setup
 ocr_model = ocr_predictor(
     "db_resnet50", "crnn_mobilenet_v3_large", pretrained=True, assume_straight_pages=True
 @spaces.GPU()
 def get_image_description(image: Image.Image) -> str:
+    global processor, vision_model
+    # on first call, load & move to cuda
+    if processor is None or vision_model is None:
+        processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
+        vision_model = LlavaNextForConditionalGeneration.from_pretrained(
+            "llava-hf/llava-v1.6-mistral-7b-hf",
+            torch_dtype=torch.float16,
+            low_cpu_mem_usage=True
+        ).to("cuda")
     torch.cuda.empty_cache()
     gc.collect()
     prompt = "[INST] <image>\nDescribe the image in a sentence [/INST]"
     inputs = processor(prompt, image, return_tensors="pt").to("cuda")
     output = vision_model.generate(**inputs, max_new_tokens=100)
     "db_resnet50 + crnn_resnet31":          ("db_resnet50", "crnn_resnet31"),
 }
+@spaces.GPU()
 def extract_data_from_pdfs(
+    docs: list[str],
+    session: dict,
+    include_images: str,    # "Include Images" or "Exclude Images"
+    do_ocr: str,            # "Get Text With OCR" or "Get Available Text Only"
+    ocr_choice: str,        # key into OCR_CHOICES
+    vlm_choice: str,        # HF repo ID for LlavaNext
     progress=gr.Progress()
 ):
     """
     1) Dynamically instantiate the chosen OCR pipeline (if any)
     2) Dynamically instantiate the chosen vision‐language model
+    3) Monkey‐patch get_image_description to use that VL model
     4) Extract text & images, index into ChromaDB
     """
     if not docs:
         raise gr.Error("No documents to process")
+    # ——— 1) OCR setup (if requested) —————————————————————
     if do_ocr == "Get Text With OCR":
         db_m, crnn_m = OCR_CHOICES[ocr_choice]
         local_ocr = ocr_predictor(db_m, crnn_m, pretrained=True, assume_straight_pages=True)
     else:
         local_ocr = None
+    # ——— 2) Vision‐language model setup ——————————————————
+    # Load processor + model *inside* the GPU worker
     proc = LlavaNextProcessor.from_pretrained(vlm_choice)
+    vis = (
+        LlavaNextForConditionalGeneration
+        .from_pretrained(vlm_choice, torch_dtype=torch.float16, low_cpu_mem_usage=True)
+        .to("cuda")
+    )
+    # ——— 3) Monkey‐patch get_image_description —————————————————
     def describe(img: Image.Image) -> str:
+        torch.cuda.empty_cache()
+        gc.collect()
         prompt = "[INST] <image>\nDescribe the image in a sentence [/INST]"
         inputs = proc(prompt, img, return_tensors="pt").to("cuda")
         output = vis.generate(**inputs, max_new_tokens=100)
     global get_image_description
     get_image_description = describe
+    # ——— 4) Extract text & images —————————————————————
     progress(0.2, "Extracting text and images…")
+    all_text = ""
+    images, names = [], []
     for path in docs:
+        # text extraction
         if local_ocr:
             pdf = DocumentFile.from_pdf(path)
             res = local_ocr(pdf)
             all_text += result_to_text(res, as_text=True) + "\n\n"
         else:
             txt = PdfReader(path).pages[0].extract_text() or ""
+            all_text += txt + "\n\n"
+        # image extraction
         if include_images == "Include Images":
             imgs = extract_images([path])
             images.extend(imgs)
             names.extend([os.path.basename(path)] * len(imgs))
+    # ——— 5) Index into ChromaDB —————————————————————
     progress(0.6, "Indexing in vector DB…")
     vdb = get_vectordb(all_text, images, names)
+    # mark session done & prepare outputs
     session["processed"] = True
     sample_imgs = images[:4] if include_images == "Include Images" else []
     return (
         vdb,
         session,
         sample_imgs,
         "<h3>Done!</h3>"
     )
 # Chat function
 def conversation(
     vdb, question: str, num_ctx, img_ctx,