Spaces:

GF-John
/

video-caption

Running on Zero

App Files Files Community

John Ho commited on Jul 23

Commit

0bf8729

1 Parent(s): 71ef59e

init commit

Browse files

Files changed (3) hide show

.github/workflows/deploy_to_hf_space.yaml +1 -1
app.py +97 -6
pyproject.toml +4 -0

.github/workflows/deploy_to_hf_space.yaml CHANGED Viewed

@@ -77,4 +77,4 @@ jobs:
         if: ${{ steps.check_hf_token.outputs.push_enabled == 'true' }}
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
-        run: git push https://HF_USERNAME:$HF_TOKEN@huggingface.co/spaces/HF_USERNAME/SPACE_NAME main"

         if: ${{ steps.check_hf_token.outputs.push_enabled == 'true' }}
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        run: git push https://GF-John:$HF_TOKEN@huggingface.co/spaces/GF-John/cam-motion main"

app.py CHANGED Viewed

@@ -1,12 +1,103 @@
 import gradio as gr
-def greet(name):
-    return "Hello " + name
-# We instantiate the Textbox class
-textbox = gr.Textbox(label="Type your name here:", placeholder="John Doe", lines=2)
-app = gr.Interface(fn=greet, inputs=textbox, outputs="text")
-app.launch()

+import spaces
 import gradio as gr
+from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+from qwen_vl_utils import process_vision_info
+# --- Installing Flash Attention for ZeroGPU is special --- #
+import subprocess
+subprocess.run(
+    "pip install flash-attn --no-build-isolation",
+    env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
+    shell=True,
+)
+# --- now we got Flash Attention ---#
+# The model is trained on 8.0 FPS which we recommend for optimal inference
+@spaces.GPU(duration=30)
+def load_model(
+    model_name: str = "chancharikm/qwen2.5-vl-7b-cam-motion-preview",
+    use_flash_attention: bool = True,
+):
+    # We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
+    model = (
+        Qwen2_5_VLForConditionalGeneration.from_pretrained(
+            model_name,
+            torch_dtype=torch.bfloat16,
+            attn_implementation="flash_attention_2",
+            device_map="cuda",
+        )
+        if use_flash_attention
+        else Qwen2_5_VLForConditionalGeneration.from_pretrained(
+            model_name,
+            torch_dtype="auto",
+            device_map="cuda",
+        )
+    )
+    return model
+@spaces.GPU(duration=120)
+def inference(video_path: str):
+    # default processor
+    processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "video",
+                    "video": video_path,
+                    "fps": 8.0,
+                },
+                {"type": "text", "text": "Describe the camera motion in this video."},
+            ],
+        }
+    ]
+    text = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+    image_inputs, video_inputs, video_kwargs = process_vision_info(
+        messages, return_video_kwargs=True
+    )
+    inputs = processor(
+        text=[text],
+        images=image_inputs,
+        videos=video_inputs,
+        fps=fps,
+        padding=True,
+        return_tensors="pt",
+        **video_kwargs,
+    )
+    inputs = inputs.to("cuda")
+    # Inference
+    generated_ids = model.generate(**inputs, max_new_tokens=128)
+    generated_ids_trimmed = [
+        out_ids[len(in_ids) :]
+        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+    ]
+    output_text = processor.batch_decode(
+        generated_ids_trimmed,
+        skip_special_tokens=True,
+        clean_up_tokenization_spaces=False,
+    )
+    return output_text
+demo = gr.Interface(
+    fn=inference,
+    inputs=[
+        gr.Video(label="Input Video"),
+    ],
+    outputs=gr.JSON(label="Output JSON"),
+    title="",
+    api_name="video_inference",
+)
+demo.launch(
+    mcp_server=True, app_kwargs={"docs_url": "/docs"}  # add FastAPI Swagger API Docs
+)

pyproject.toml CHANGED Viewed

@@ -6,4 +6,8 @@ readme = "README.md"
 requires-python = ">=3.10"
 dependencies = [
     "gradio>=5.38.0",
 ]

 requires-python = ">=3.10"
 dependencies = [
     "gradio>=5.38.0",
+    "transformers==4.44.0",
+    "pydantic==2.10.6",
+    "loguru>=0.7.3",
+    "qwen-vl-utils>=0.0.11"
 ]