Improve model card: Add paper abstract and standardize pipeline tag casing

#1
by nielsr (HF Staff) - opened

Files changed (1)
README.md (+70 -34)
README.md CHANGED
@@ -1,17 +1,17 @@
  ---
- license: mit
- pipeline_tag: image-text-to-text
- library_name: transformers
  base_model:
- - OpenGVLab/InternVL2_5-4B
- base_model_relation: finetune
  datasets:
- - OpenGVLab/MMPR-v1.1
  language:
- - multilingual
  tags:
- - internvl
- - custom_code
  ---

  # InternVL2_5-4B-MPO
@@ -24,6 +24,14 @@ tags:
  <img width="500" alt="image" src="https://cdn-uploads.huggingface.co/production/uploads/64006c09330a45b03605bba3/zJsd2hqd3EevgXo6fNgC-.png">
  </div>

  ## Introduction

  We introduce InternVL2.5-MPO, an advanced multimodal large language model (MLLM) series that demonstrates superior overall performance. This series builds upon InternVL2.5 and Mixed Preference Optimization.
@@ -113,7 +121,7 @@ Additionally, the BCO loss is employed as the quality loss, which helps the mode
  The loss function is defined as:

  $$
- \mathcal{L}_{\text{q}}=\mathcal{L}_{\text{q}}^+ + \mathcal{L}_{\text{q}}^-,
  $$

  where \\(\mathcal{L}_{\text{q}}^{+}\\) and \\(\mathcal{L}_{\text{q}}^{-}\\) represent the loss for chosen and rejected responses, respectively.
@@ -216,7 +224,6 @@ def split_model(model_name):
      num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
      layer_cnt = 0
      for i, num_layer in enumerate(num_layers_per_gpu):
-         for j in range(num_layer):
              device_map[f'language_model.model.layers.{layer_cnt}'] = i
              layer_cnt += 1
      device_map['vision_model'] = 0
@@ -344,40 +351,50 @@ generation_config = dict(max_new_tokens=1024, do_sample=True)
  # pure-text conversation (纯文本对话)
  question = 'Hello, who are you?'
  response, history = model.chat(tokenizer, None, question, generation_config, history=None, return_history=True)
- print(f'User: {question}\nAssistant: {response}')

  question = 'Can you tell me a story?'
  response, history = model.chat(tokenizer, None, question, generation_config, history=history, return_history=True)
- print(f'User: {question}\nAssistant: {response}')

  # single-image single-round conversation (单图单轮对话)
- question = '<image>\nPlease describe the image shortly.'
  response = model.chat(tokenizer, pixel_values, question, generation_config)
- print(f'User: {question}\nAssistant: {response}')

  # single-image multi-round conversation (单图多轮对话)
- question = '<image>\nPlease describe the image in detail.'
  response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
- print(f'User: {question}\nAssistant: {response}')

  question = 'Please write a poem according to the image.'
  response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=history, return_history=True)
- print(f'User: {question}\nAssistant: {response}')

  # multi-image multi-round conversation, combined images (多图多轮对话,拼接图像)
  pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
  pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
  pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)

- question = '<image>\nDescribe the two images in detail.'
  response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                 history=None, return_history=True)
- print(f'User: {question}\nAssistant: {response}')

  question = 'What are the similarities and differences between these two images.'
  response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                 history=history, return_history=True)
- print(f'User: {question}\nAssistant: {response}')

  # multi-image multi-round conversation, separate images (多图多轮对话,独立图像)
  pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
@@ -385,17 +402,21 @@ pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat1
  pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
  num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]

- question = 'Image-1: <image>\nImage-2: <image>\nDescribe the two images in detail.'
  response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                 num_patches_list=num_patches_list,
                                 history=None, return_history=True)
- print(f'User: {question}\nAssistant: {response}')

  question = 'What are the similarities and differences between these two images.'
  response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                 num_patches_list=num_patches_list,
                                 history=history, return_history=True)
- print(f'User: {question}\nAssistant: {response}')

  # batch inference, single image per sample (单图批处理)
  pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
@@ -403,13 +424,15 @@ pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat1
  num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
  pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)

- questions = ['<image>\nDescribe the image in detail.'] * len(num_patches_list)
  responses = model.batch_chat(tokenizer, pixel_values,
                               num_patches_list=num_patches_list,
                               questions=questions,
                               generation_config=generation_config)
  for question, response in zip(questions, responses):
-     print(f'User: {question}\nAssistant: {response}')

  # video multi-round conversation (视频多轮对话)
  def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
@@ -447,17 +470,24 @@ def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=3
  video_path = './examples/red-panda.mp4'
  pixel_values, num_patches_list = load_video(video_path, num_segments=8, max_num=1)
  pixel_values = pixel_values.to(torch.bfloat16).cuda()
- video_prefix = ''.join([f'Frame{i+1}: <image>\n' for i in range(len(num_patches_list))])
  question = video_prefix + 'What is the red panda doing?'
- # Frame1: <image>\nFrame2: <image>\n...\nFrame8: <image>\n{question}
  response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                 num_patches_list=num_patches_list, history=None, return_history=True)
- print(f'User: {question}\nAssistant: {response}')

  question = 'Describe this video in detail.'
  response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                 num_patches_list=num_patches_list, history=history, return_history=True)
- print(f'User: {question}\nAssistant: {response}')
  ```

  #### Streaming Output
@@ -539,7 +569,9 @@ image_urls=[

  images = [load_image(img_url) for img_url in image_urls]
  # Numbering images improves multi-image conversations
- response = pipe((f'Image-1: {IMAGE_TOKEN}\nImage-2: {IMAGE_TOKEN}\ndescribe these two images', images))
  print(response.text)
  ```

@@ -648,8 +680,12 @@ If you find this project useful in your research, please consider citing:
  @article{chen2024far,
    title={How Far Are We to GPT-4V? Closing the Gap to Commercial Multimodal Models with Open-Source Suites},
    author={Chen, Zhe and Wang, Weiyun and Tian, Hao and Ye, Shenglong and Gao, Zhangwei and Cui, Erfei and Tong, Wenwen and Hu, Kongzhi and Luo, Jiapeng and Ma, Zheng and others},
-   journal={arXiv preprint arXiv:2404.16821},
-   year={2024}
  }
  @inproceedings{chen2024internvl,
    title={Internvl: Scaling up vision foundation models and aligning for generic visual-linguistic tasks},
@@ -658,4 +694,4 @@ If you find this project useful in your research, please consider citing:
    pages={24185--24198},
    year={2024}
  }
- ```

  ---
  base_model:
+ - OpenGVLab/InternVL2_5-4B
  datasets:
+ - OpenGVLab/MMPR-v1.1
  language:
+ - multilingual
+ library_name: transformers
+ license: mit
+ pipeline_tag: image-text-to-text
  tags:
+ - internvl
+ - custom_code
+ base_model_relation: finetune
  ---

  # InternVL2_5-4B-MPO

  <img width="500" alt="image" src="https://cdn-uploads.huggingface.co/production/uploads/64006c09330a45b03605bba3/zJsd2hqd3EevgXo6fNgC-.png">
  </div>

+ ## Paper Information
+
+ The model was presented in the paper [Expanding Performance Boundaries of Open-Source Multimodal Models with Model, Data, and Test-Time Scaling](https://huggingface.co/papers/2412.05271).
+
+ ## Abstract
+
+ We introduce InternVL 2.5, an advanced multimodal large language model (MLLM) series that builds upon InternVL 2.0, maintaining its core model architecture while introducing significant enhancements in training and testing strategies as well as data quality. In this work, we delve into the relationship between model scaling and performance, systematically exploring the performance trends in vision encoders, language models, dataset sizes, and test-time configurations. Through extensive evaluations on a wide range of benchmarks, including multi-discipline reasoning, document understanding, multi-image / video understanding, real-world comprehension, multimodal hallucination detection, visual grounding, multilingual capabilities, and pure language processing, InternVL 2.5 exhibits competitive performance, rivaling leading commercial models such as GPT-4o and Claude-3.5-Sonnet. Notably, our model is the first open-source MLLM to surpass 70% on the MMMU benchmark, achieving a 3.7-point improvement through Chain-of-Thought (CoT) reasoning and showcasing strong potential for test-time scaling. We hope this model contributes to the open-source community by setting new standards for developing and applying multimodal AI systems. A HuggingFace demo is available via the paper page linked above.
+
  ## Introduction

  We introduce InternVL2.5-MPO, an advanced multimodal large language model (MLLM) series that demonstrates superior overall performance. This series builds upon InternVL2.5 and Mixed Preference Optimization.
 
  The loss function is defined as:

  $$
+ \mathcal{L}_{\text{q}}=\mathcal{L}_{\text{q}}^{+} + \mathcal{L}_{\text{q}}^{-},
  $$

  where \\(\mathcal{L}_{\text{q}}^{+}\\) and \\(\mathcal{L}_{\text{q}}^{-}\\) represent the loss for chosen and rejected responses, respectively.
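  For reference, the two terms follow the standard BCO formulation; the sketch below assumes \\(y_c\\) and \\(y_r\\) are the chosen and rejected responses, \\(\pi_\theta\\) the policy, \\(\pi_0\\) the reference model, and \\(\delta\\) a reward shift used to stabilize training (the exact reward-shift definition used by MPO may differ slightly):

  $$
  \mathcal{L}_{\text{q}}^{+} = -\log \sigma\!\left(\beta \log \frac{\pi_\theta(y_c \mid x)}{\pi_0(y_c \mid x)} - \delta\right), \qquad
  \mathcal{L}_{\text{q}}^{-} = -\log \sigma\!\left(-\left(\beta \log \frac{\pi_\theta(y_r \mid x)}{\pi_0(y_r \mid x)} - \delta\right)\right).
  $$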
 
      num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
      layer_cnt = 0
      for i, num_layer in enumerate(num_layers_per_gpu):
          for j in range(num_layer):
              device_map[f'language_model.model.layers.{layer_cnt}'] = i
              layer_cnt += 1
      device_map['vision_model'] = 0
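  This hunk only builds the `device_map` that spreads the language-model layers across GPUs; it is consumed by `from_pretrained` elsewhere in the card. A minimal usage sketch, assuming the repo id `OpenGVLab/InternVL2_5-4B-MPO` and standard Transformers loading flags (these assumptions are not part of this diff):

  ```python
  import torch
  from transformers import AutoModel, AutoTokenizer

  path = 'OpenGVLab/InternVL2_5-4B-MPO'        # assumed repo id for this card
  device_map = split_model('InternVL2_5-4B')   # helper defined above in the card

  model = AutoModel.from_pretrained(
      path,
      torch_dtype=torch.bfloat16,    # keep weights in bf16, matching the card's examples
      low_cpu_mem_usage=True,
      trust_remote_code=True,        # InternVL ships custom modeling code
      device_map=device_map).eval()  # place vision tower on GPU 0, LLM layers per the map
  tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
  ```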
 
  # pure-text conversation (纯文本对话)
  question = 'Hello, who are you?'
  response, history = model.chat(tokenizer, None, question, generation_config, history=None, return_history=True)
+ print(f'User: {question}\nAssistant: {response}')

  question = 'Can you tell me a story?'
  response, history = model.chat(tokenizer, None, question, generation_config, history=history, return_history=True)
+ print(f'User: {question}\nAssistant: {response}')

  # single-image single-round conversation (单图单轮对话)
+ question = '<image>\nPlease describe the image shortly.'
  response = model.chat(tokenizer, pixel_values, question, generation_config)
+ print(f'User: {question}\nAssistant: {response}')

  # single-image multi-round conversation (单图多轮对话)
+ question = '<image>\nPlease describe the image in detail.'
  response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
+ print(f'User: {question}\nAssistant: {response}')

  question = 'Please write a poem according to the image.'
  response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=history, return_history=True)
+ print(f'User: {question}\nAssistant: {response}')

  # multi-image multi-round conversation, combined images (多图多轮对话,拼接图像)
  pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
  pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
  pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)

+ question = '<image>\nDescribe the two images in detail.'
  response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                 history=None, return_history=True)
+ print(f'User: {question}\nAssistant: {response}')

  question = 'What are the similarities and differences between these two images.'
  response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                 history=history, return_history=True)
+ print(f'User: {question}\nAssistant: {response}')

  # multi-image multi-round conversation, separate images (多图多轮对话,独立图像)
  pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
 
  pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
  num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]

+ question = 'Image-1: <image>\nImage-2: <image>\nDescribe the two images in detail.'
  response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                 num_patches_list=num_patches_list,
                                 history=None, return_history=True)
+ print(f'User: {question}\nAssistant: {response}')

  question = 'What are the similarities and differences between these two images.'
  response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                 num_patches_list=num_patches_list,
                                 history=history, return_history=True)
+ print(f'User: {question}\nAssistant: {response}')

  # batch inference, single image per sample (单图批处理)
  pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
 
  num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
  pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)

+ questions = ['<image>\nDescribe the image in detail.'] * len(num_patches_list)
  responses = model.batch_chat(tokenizer, pixel_values,
                               num_patches_list=num_patches_list,
                               questions=questions,
                               generation_config=generation_config)
  for question, response in zip(questions, responses):
+     print(f'User: {question}\nAssistant: {response}')

  # video multi-round conversation (视频多轮对话)
  def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
 
  video_path = './examples/red-panda.mp4'
  pixel_values, num_patches_list = load_video(video_path, num_segments=8, max_num=1)
  pixel_values = pixel_values.to(torch.bfloat16).cuda()
+ video_prefix = ''.join([f'Frame{i+1}: <image>\n' for i in range(len(num_patches_list))])
  question = video_prefix + 'What is the red panda doing?'
+ # Frame1: <image>\nFrame2: <image>\n...\nFrame8: <image>\n{question}
  response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                 num_patches_list=num_patches_list, history=None, return_history=True)
+ print(f'User: {question}\nAssistant: {response}')

  question = 'Describe this video in detail.'
  response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                 num_patches_list=num_patches_list, history=history, return_history=True)
+ print(f'User: {question}\nAssistant: {response}')
  ```

  #### Streaming Output
 

  images = [load_image(img_url) for img_url in image_urls]
  # Numbering images improves multi-image conversations
+ response = pipe((f'Image-1: {IMAGE_TOKEN}\nImage-2: {IMAGE_TOKEN}\ndescribe these two images', images))
  print(response.text)
  ```

 
  @article{chen2024far,
    title={How Far Are We to GPT-4V? Closing the Gap to Commercial Multimodal Models with Open-Source Suites},
    author={Chen, Zhe and Wang, Weiyun and Tian, Hao and Ye, Shenglong and Gao, Zhangwei and Cui, Erfei and Tong, Wenwen and Hu, Kongzhi and Luo, Jiapeng and Ma, Zheng and others},
+   journal={Science China Information Sciences},
+   volume={67},
+   number={12},
+   pages={220101},
+   year={2024},
+   publisher={Springer}
  }
  @inproceedings{chen2024internvl,
    title={Internvl: Scaling up vision foundation models and aligning for generic visual-linguistic tasks},

    pages={24185--24198},
    year={2024}
  }
+ ```