John Ho committed
Commit 0bf8729 · 1 Parent(s): 71ef59e

init commit

.github/workflows/deploy_to_hf_space.yaml CHANGED
@@ -77,4 +77,4 @@ jobs:
         if: ${{ steps.check_hf_token.outputs.push_enabled == 'true' }}
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
-        run: git push https://HF_USERNAME:$HF_TOKEN@huggingface.co/spaces/HF_USERNAME/SPACE_NAME main
+        run: git push https://GF-John:$HF_TOKEN@huggingface.co/spaces/GF-John/cam-motion main
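For reference, the same deploy step can also be done from Python with huggingface_hub rather than embedding the token in a git URL; a minimal sketch, assuming the workflow has already checked out the repo and exported HF_TOKEN (the folder path is illustrative):

# Sketch: upload the checked-out repo to the Space via the Hub API.
# Assumes HF_TOKEN is set in the environment; folder_path is illustrative.
import os

from huggingface_hub import HfApi

api = HfApi(token=os.environ["HF_TOKEN"])
api.upload_folder(
    folder_path=".",  # repo root after checkout
    repo_id="GF-John/cam-motion",
    repo_type="space",
)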
app.py CHANGED
@@ -1,12 +1,105 @@
+import spaces
 import gradio as gr
+import torch
+from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+from qwen_vl_utils import process_vision_info
 
-
-def greet(name):
-    return "Hello " + name
-
-
-# We instantiate the Textbox class
-textbox = gr.Textbox(label="Type your name here:", placeholder="John Doe", lines=2)
-
-app = gr.Interface(fn=greet, inputs=textbox, outputs="text")
-app.launch()
+# --- Installing Flash Attention for ZeroGPU is special --- #
+import subprocess
+
+subprocess.run(
+    "pip install flash-attn --no-build-isolation",
+    env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
+    shell=True,
+)
+# --- now we have Flash Attention --- #
+
+# The model is trained at 8.0 FPS, which we recommend for optimal inference.
+
+
+@spaces.GPU(duration=30)
+def load_model(
+    model_name: str = "chancharikm/qwen2.5-vl-7b-cam-motion-preview",
+    use_flash_attention: bool = True,
+):
+    # We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
+    model = (
+        Qwen2_5_VLForConditionalGeneration.from_pretrained(
+            model_name,
+            torch_dtype=torch.bfloat16,
+            attn_implementation="flash_attention_2",
+            device_map="cuda",
+        )
+        if use_flash_attention
+        else Qwen2_5_VLForConditionalGeneration.from_pretrained(
+            model_name,
+            torch_dtype="auto",
+            device_map="cuda",
+        )
+    )
+    return model
+
+
+@spaces.GPU(duration=120)
+def inference(video_path: str):
+    # Load the model and the default processor.
+    model = load_model()
+    processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
+
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "video",
+                    "video": video_path,
+                    "fps": 8.0,
+                },
+                {"type": "text", "text": "Describe the camera motion in this video."},
+            ],
+        }
+    ]
+
+    text = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+    image_inputs, video_inputs, video_kwargs = process_vision_info(
+        messages, return_video_kwargs=True
+    )
+    # The fps requested in `messages` is forwarded through `video_kwargs`.
+    inputs = processor(
+        text=[text],
+        images=image_inputs,
+        videos=video_inputs,
+        padding=True,
+        return_tensors="pt",
+        **video_kwargs,
+    )
+    inputs = inputs.to("cuda")
+
+    # Inference
+    generated_ids = model.generate(**inputs, max_new_tokens=128)
+    generated_ids_trimmed = [
+        out_ids[len(in_ids) :]
+        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+    ]
+    output_text = processor.batch_decode(
+        generated_ids_trimmed,
+        skip_special_tokens=True,
+        clean_up_tokenization_spaces=False,
+    )
+    return output_text
+
+
+demo = gr.Interface(
+    fn=inference,
+    inputs=[
+        gr.Video(label="Input Video"),
+    ],
+    outputs=gr.JSON(label="Output JSON"),
+    title="",
+    api_name="video_inference",
+)
+demo.launch(
+    mcp_server=True, app_kwargs={"docs_url": "/docs"}  # add FastAPI Swagger API docs
+)
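Because the Interface registers api_name="video_inference", the deployed Space can be queried programmatically; a minimal client sketch, assuming the Space is live at GF-John/cam-motion (the local video path is a placeholder):

# Sketch: call the Space's named endpoint with gradio_client.
from gradio_client import Client, handle_file

client = Client("GF-John/cam-motion")
result = client.predict(
    handle_file("clip.mp4"),  # placeholder path to a local video
    api_name="/video_inference",
)
print(result)  # list containing the generated camera-motion description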
pyproject.toml CHANGED
@@ -6,4 +6,8 @@ readme = "README.md"
 requires-python = ">=3.10"
 dependencies = [
     "gradio>=5.38.0",
+    "transformers>=4.49.0",  # Qwen2.5-VL classes landed in transformers 4.49
+    "pydantic==2.10.6",
+    "loguru>=0.7.3",
+    "qwen-vl-utils>=0.0.11",
 ]
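As a quick sanity check that the pins above expose everything app.py imports (torch and spaces are not listed here, presumably supplied by the Space runtime), a small smoke-test sketch:

# Sketch: verify the declared dependencies import cleanly.
import gradio
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

print("gradio", gradio.__version__)  # expect >= 5.38.0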