| from datasets import Dataset |
| from consts import REASONING_START, REASONING_END, SOLUTION_START, SOLUTION_END |
|
|
|
|
|
|
def is_numeric_answer(example):
    """Return True iff the example's "answer" field parses as a float.

    Used as a ``dataset.filter`` predicate, so it must return a bool.
    The previous version returned the error *string* ``f"error: {e}"`` on
    failure — a non-empty (truthy) value — so non-numeric answers were
    never actually filtered out.
    """
    try:
        float(example["answer"])
    except (TypeError, ValueError):
        # Not a number (or None / wrong type): exclude this example.
        return False
    return True
|
|
def resize_images(example):
    """Resize the example's decoded image to a fixed 512x512 resolution."""
    example["decoded_image"] = example["decoded_image"].resize((512, 512))
    return example
|
|
|
|
def convert_to_rgb(example):
    """Ensure the example's decoded image is in RGB mode.

    Images already in RGB are left untouched; anything else is converted.
    """
    img = example["decoded_image"]
    if img.mode == "RGB":
        return example
    example["decoded_image"] = img.convert("RGB")
    return example
|
|
|
|
def make_conversation(example):
    """Build a single-turn chat prompt pairing the image with the question.

    The question text instructs the model to wrap its reasoning and final
    answer in the marker tags imported from ``consts``. Returns a dict with
    the chat-format "prompt", the decoded image, and the ground-truth
    "answer" carried through unchanged.
    """
    question_text = (
        f"{example['question']}, provide your reasoning between {REASONING_START} and {REASONING_END} "
        f"and then your final answer between {SOLUTION_START} and (put a float here) {SOLUTION_END}"
    )

    user_turn = {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": question_text},
        ],
    }

    return {
        "prompt": [user_turn],
        "image": example["decoded_image"],
        "answer": example["answer"],
    }
|
|
|
|
|
|
def dataset_setup(dataset: Dataset, tokenizer) -> tuple[Dataset, Dataset]:
    """Prepare a vision-language dataset for training.

    Pipeline: drop examples whose answer is not numeric, resize images to
    512x512, convert them to RGB, build chat-style prompts, and render those
    prompts to strings with the tokenizer's chat template.

    Args:
        dataset: Source dataset with "question", "answer", "decoded_image"
            columns (and an existing "image" column that gets replaced).
            -- assumed from the fields accessed below; confirm against caller.
        tokenizer: Tokenizer/processor exposing ``apply_chat_template``.

    Returns:
        ``(train_dataset, dataset)`` — the fully processed training dataset,
        and the intermediate filtered/resized dataset before prompt
        templating. (The original annotation said ``-> Dataset`` but the
        function has always returned this 2-tuple.)
    """
    dataset = dataset.filter(is_numeric_answer)
    dataset = dataset.map(resize_images)
    dataset = dataset.map(convert_to_rgb)
    train_dataset = dataset.map(make_conversation)

    # Replace the raw "image" column with the resized/RGB "decoded_image".
    train_dataset = train_dataset.remove_columns("image")
    train_dataset = train_dataset.rename_column("decoded_image", "image")

    # Render the structured chat prompt into a plain training string.
    train_dataset = train_dataset.map(
        lambda example: {
            "prompt": tokenizer.apply_chat_template(
                example["prompt"],
                tokenize=False,
                add_generation_prompt=False,
            )
        }
    )
    return train_dataset, dataset
|
|
|
|