DeepGlint-AI
/

UniME-LLaVA-OneVision-7B

@@ -1,15 +1,15 @@
 ---
-license: mit
 datasets:
 - TIGER-Lab/MMEB-train
 language:
 - en
 metrics:
 - recall
-base_model:
-- llava-hf/llava-onevision-qwen2-7b-ov-hf
-pipeline_tag: image-text-to-text
-library_name: transformers
 ---
 # Breaking the Modality Barrier: Universal Embedding Learning with Multimodal LLMs
@@ -72,14 +72,17 @@ def appply_chat_template(image=None, text=None):
                 "role": "user",
                 "content": [
                     {"type": "image", "image": image},
-                    {"type": "text", "text": "Summary above image in one word:\n"},
                     ],
             }]
     elif text!= None:
         conversation_image = [{
                 "role": "user",
                 "content": [
-                    {"type": "text", "text": f"{text}\nSummary above sentence in one word:\n"},
                     ],
             }]
     return conversation_image

 ---
+base_model:
+- llava-hf/llava-onevision-qwen2-7b-ov-hf
 datasets:
 - TIGER-Lab/MMEB-train
 language:
 - en
+library_name: transformers
+license: mit
 metrics:
 - recall
+pipeline_tag: zero-shot-image-classification
 ---
 # Breaking the Modality Barrier: Universal Embedding Learning with Multimodal LLMs
                 "role": "user",
                 "content": [
                     {"type": "image", "image": image},
+                    {"type": "text", "text": "Summary above image in one word:\n"},
                     ],
             }]
     elif text!= None:
         conversation_image = [{
                 "role": "user",
                 "content": [
+                    {"type": "text", "text": f"{text}\nSummary above sentence in one word:\n"},
                     ],
             }]
     return conversation_image