Bee-8B-Stage3 / image_processing_bee.py

Upload folder using huggingface_hub

2395f55 verified 2 months ago

21.5 kB

	# coding=utf-8
	# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	from collections.abc import Iterable
	from typing import Optional, Union

	import numpy as np

	from transformers.image_processing_utils import (
	BaseImageProcessor,
	BatchFeature,
	get_patch_output_size,
	get_size_dict,
	select_best_resolution,
	)
	from transformers.image_transforms import (
	PaddingMode,
	convert_to_rgb,
	pad,
	resize,
	to_channel_dimension_format,
	)
	from transformers.image_utils import (
	OPENAI_CLIP_MEAN,
	OPENAI_CLIP_STD,
	ChannelDimension,
	ImageInput,
	PILImageResampling,
	get_image_size,
	infer_channel_dimension_format,
	is_scaled_image,
	make_flat_list_of_images,
	to_numpy_array,
	valid_images,
	validate_preprocess_arguments,
	)
	from transformers.utils import TensorType, is_vision_available, logging

	# Module-level logger obtained through the transformers logging utilities.
	logger = logging.get_logger(__name__)

	# PIL is imported only when `is_vision_available()` returns a truthy value,
	# so the name `Image` may be undefined in environments where it does not.
	if is_vision_available():
	from PIL import Image


	# Copied from transformers.models.llava_next.image_processing_llava_next.divide_to_patches
	def divide_to_patches(image: np.array, patch_size: int,
	                      input_data_format) -> list[np.array]:
	    """
	    Cut an image into tiles of at most `patch_size` x `patch_size` pixels.

	    Tiles are produced row by row (top to bottom), and left to right inside
	    each row. Tiles on the bottom/right border are smaller when the image
	    height/width is not a multiple of `patch_size`.

	    Args:
	        image (`np.array`):
	            The image to split.
	        patch_size (`int`):
	            Edge length of a tile, also used as the stride between tiles.
	        input_data_format (`ChannelDimension` or `str`):
	            Channel layout of `image`. Anything other than
	            `ChannelDimension.LAST` is sliced as channels-first.

	    Returns:
	        list: The tiles, as slices of `image`.
	    """
	    img_height, img_width = get_image_size(image,
	                                           channel_dim=input_data_format)
	    channels_last = input_data_format == ChannelDimension.LAST

	    row_spans = [
	        slice(top, top + patch_size)
	        for top in range(0, img_height, patch_size)
	    ]
	    col_spans = [
	        slice(left, left + patch_size)
	        for left in range(0, img_width, patch_size)
	    ]

	    tiles = []
	    for rows in row_spans:
	        for cols in col_spans:
	            if channels_last:
	                tiles.append(image[rows, cols])
	            else:
	                tiles.append(image[:, rows, cols])
	    return tiles


	# Copied from transformers.models.llava_next.image_processing_llava_next.expand_to_square
	def expand_to_square(image: np.array, background_color,
	                     input_data_format) -> np.array:
	    """
	    Expand an image to a square canvas filled with a background color.

	    The image is centered along its shorter side; when the size difference
	    is odd, the extra row/column of background ends up after the image.

	    Args:
	        image (`np.array`):
	            The image to expand.
	        background_color:
	            Value multiplied into an all-ones canvas, so it must broadcast
	            against the channel axis (a scalar or one value per channel).
	        input_data_format (`ChannelDimension` or `str`):
	            Channel layout of `image`. When `None`, it is inferred with
	            `infer_channel_dimension_format`.

	    Returns:
	        `np.array`: `image` itself when it is already square, otherwise a new
	        square array in the same channel layout as the input. The dtype of
	        the new array follows NumPy's promotion rules for
	        `ones(dtype=image.dtype) * background_color`.
	    """
	    height, width = get_image_size(image, channel_dim=input_data_format)
	    if width == height:
	        return image

	    if input_data_format is None:
	        input_data_format = infer_channel_dimension_format(image)
	    channels_first = input_data_format == ChannelDimension.FIRST

	    # Work in channels-last so that `background_color` broadcasts over the
	    # trailing (channel) axis regardless of the input layout.
	    pixels = np.moveaxis(image, 0, -1) if channels_first else image

	    side = max(height, width)
	    canvas = np.ones((side, side, pixels.shape[2]),
	                     dtype=image.dtype) * background_color

	    # One of the two offsets is always 0 because `side` equals the longer edge.
	    top = (side - height) // 2
	    left = (side - width) // 2
	    canvas[top:top + height, left:left + width] = pixels

	    # Hand the result back in the layout the caller passed in.
	    return np.moveaxis(canvas, -1, 0) if channels_first else canvas


	class BeeImageProcessor(BaseImageProcessor):
	    """
	    Image processor that converts images into stacks of equally sized patches.

	    For a sample that consists of a single image, `preprocess` selects a
	    resolution from `image_grid_pinpoints`, resizes and pads the image to it,
	    cuts it into tiles and prepends a resized copy of the whole image. For a
	    sample that consists of several images, each image is padded to a square
	    instead. Every resulting image is then optionally resized, rescaled and
	    normalized.
	    """

	    # NOTE(review): `preprocess` returns the keys "pixel_values", "image_sizes"
	    # and "batch_num_images"; confirm "pixel_values_videos" is intended here.
	    model_input_names = ["pixel_values_videos"]

	    def __init__(
	        self,
	        do_resize: bool = True,
	        size: Optional[dict[str, int]] = None,
	        image_grid_pinpoints: Optional[list] = None,
	        resample: PILImageResampling = PILImageResampling.BICUBIC,
	        do_rescale: bool = True,
	        rescale_factor: Union[int, float] = 1 / 255,
	        do_normalize: bool = True,
	        image_mean: Optional[Union[float, list[float]]] = None,
	        image_std: Optional[Union[float, list[float]]] = None,
	        do_pad: Optional[bool] = True,
	        do_convert_rgb: bool = True,
	        **kwargs,
	    ) -> None:
	        """
	        Store the default preprocessing options.

	        Args:
	            do_resize (`bool`, defaults to `True`):
	                Whether patches are resized to `size`.
	            size (`dict[str, int]`, *optional*):
	                Target size; defaults to `{"height": 384, "width": 384}`.
	            image_grid_pinpoints (`list`, *optional*):
	                Candidate resolutions for patching; defaults to
	                `[[384, 768], [768, 384], [768, 768], [1152, 384], [384, 1152]]`.
	            resample (`PILImageResampling`, defaults to `BICUBIC`):
	                Resampling filter used when resizing.
	            do_rescale (`bool`, defaults to `True`):
	                Whether pixel values are multiplied by `rescale_factor`.
	            rescale_factor (`int` or `float`, defaults to `1 / 255`):
	                Factor used when rescaling.
	            do_normalize (`bool`, defaults to `True`):
	                Whether images are normalized with `image_mean`/`image_std`.
	            image_mean (`float` or `list[float]`, *optional*):
	                Normalization mean; defaults to `OPENAI_CLIP_MEAN`.
	            image_std (`float` or `list[float]`, *optional*):
	                Normalization std; defaults to `OPENAI_CLIP_STD`.
	            do_pad (`bool`, *optional*, defaults to `True`):
	                Whether per-image patch stacks are padded to a common length.
	            do_convert_rgb (`bool`, defaults to `True`):
	                Whether images are converted to RGB first.
	            **kwargs:
	                Forwarded to `BaseImageProcessor.__init__`.
	        """
	        super().__init__(**kwargs)
	        size = size if size is not None else {"height": 384, "width": 384}
	        size = get_size_dict(size, default_to_square=False)
	        image_grid_pinpoints = (
	            image_grid_pinpoints if image_grid_pinpoints is not None else
	            [[384, 768], [768, 384], [768, 768], [1152, 384], [384, 1152]])
	        self.do_resize = do_resize
	        self.size = size
	        self.image_grid_pinpoints = image_grid_pinpoints
	        self.resample = resample
	        self.do_rescale = do_rescale
	        self.rescale_factor = rescale_factor
	        self.do_normalize = do_normalize
	        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
	        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
	        self.do_pad = do_pad
	        self.do_convert_rgb = do_convert_rgb

	    # Copied from transformers.models.llava_next.image_processing_llava_next.LlavaNextImageProcessor.pad
	    def pad(
	        self,
	        image: np.ndarray,
	        padding: Union[int, tuple[int, int], Iterable[tuple[int, int]]],
	        mode: PaddingMode = PaddingMode.CONSTANT,
	        constant_values: Union[float, Iterable[float]] = 0.0,
	        data_format: Optional[Union[str, ChannelDimension]] = None,
	        input_data_format: Optional[Union[str, ChannelDimension]] = None,
	    ) -> np.ndarray:
	        """
	        Pad `image` according to `padding`.

	        An `int` padding, or a padding whose length is not 4, is delegated to
	        the module-level `pad` helper. A padding of length 4 is applied
	        directly with `np.pad`, one entry per array axis.

	        Args:
	            image (`np.ndarray`):
	                The array to pad.
	            padding (`int`, `tuple[int, int]` or iterable of `tuple[int, int]`):
	                Amount of padding.
	            mode (`PaddingMode`, defaults to `PaddingMode.CONSTANT`):
	                One of `CONSTANT`, `REFLECT`, `REPLICATE` or `SYMMETRIC`.
	            constant_values (`float` or iterable of `float`, defaults to `0.0`):
	                Fill value, used in `CONSTANT` mode.
	            data_format (`str` or `ChannelDimension`, *optional*):
	                Channel layout of the result; unchanged when `None`.
	            input_data_format (`str` or `ChannelDimension`, *optional*):
	                Channel layout of `image`; inferred when `None`.

	        Returns:
	            `np.ndarray`: The padded array.

	        Raises:
	            ValueError: If `mode` is not one of the supported padding modes.
	        """
	        # call the general `pad` if padding on `height/width`, otherwise it's the `num_patches` dim
	        if isinstance(padding, int) or len(padding) != 4:
	            return pad(image, padding, mode, constant_values, data_format,
	                       input_data_format)

	        if input_data_format is None:
	            input_data_format = infer_channel_dimension_format(image)
	        if mode == PaddingMode.CONSTANT:
	            image = np.pad(image,
	                           padding,
	                           mode="constant",
	                           constant_values=constant_values)
	        elif mode == PaddingMode.REFLECT:
	            image = np.pad(image, padding, mode="reflect")
	        elif mode == PaddingMode.REPLICATE:
	            image = np.pad(image, padding, mode="edge")
	        elif mode == PaddingMode.SYMMETRIC:
	            image = np.pad(image, padding, mode="symmetric")
	        else:
	            raise ValueError(f"Invalid padding mode: {mode}")
	        if data_format is not None:
	            image = to_channel_dimension_format(image, data_format,
	                                                input_data_format)
	        return image

	    # Copied from transformers.models.llava_next.image_processing_llava_next.LlavaNextImageProcessor._resize_for_patching
	    def _resize_for_patching(self, image: np.ndarray, target_resolution: tuple,
	                             resample,
	                             input_data_format: ChannelDimension) -> np.ndarray:
	        """
	        Resize `image` to the size that `get_patch_output_size` computes for
	        `target_resolution`.

	        Args:
	            image (`np.ndarray`):
	                The image to resize.
	            target_resolution (`tuple`):
	                The resolution the image is being fitted to.
	            resample:
	                Resampling filter passed to `resize`.
	            input_data_format (`ChannelDimension`):
	                Channel layout of `image`.

	        Returns:
	            `np.ndarray`: The resized image.
	        """
	        new_height, new_width = get_patch_output_size(image, target_resolution,
	                                                      input_data_format)

	        return resize(image, (new_height, new_width),
	                      resample=resample,
	                      input_data_format=input_data_format)

	    # Copied from transformers.models.llava_next.image_processing_llava_next.LlavaNextImageProcessor._get_padding_size
	    def _get_padding_size(self, original_resolution: tuple,
	                          target_resolution: tuple):
	        """
	        Split the size difference between two resolutions into per-side padding.

	        Args:
	            original_resolution (`tuple`):
	                Current `(height, width)`.
	            target_resolution (`tuple`):
	                Desired `(height, width)`.

	        Returns:
	            `tuple`: `((before, after), (before, after))` padding for height,
	            then width. When a difference is odd, the extra pixel goes to the
	            `after` side.
	        """
	        original_height, original_width = original_resolution
	        target_height, target_width = target_resolution
	        paste_x, r_x = divmod(target_width - original_width, 2)
	        paste_y, r_y = divmod(target_height - original_height, 2)
	        return (paste_y, paste_y + r_y), (paste_x, paste_x + r_x)

	    # Adapted from transformers.models.llava_next.image_processing_llava_next.LlavaNextImageProcessor._pad_for_patching
	    def _pad_for_patching(self, image: np.ndarray, target_resolution: tuple,
	                          input_data_format: ChannelDimension) -> np.ndarray:
	        """
	        Pad an image to a target resolution while maintaining aspect ratio.

	        Args:
	            image (`np.ndarray`):
	                The image to pad.
	            target_resolution (`tuple`):
	                The `(height, width)` to pad to.
	            input_data_format (`ChannelDimension`):
	                Channel layout of `image`.

	        Returns:
	            `np.ndarray`: The padded image.
	        """
	        new_resolution = get_patch_output_size(image, target_resolution,
	                                               input_data_format)
	        padding = self._get_padding_size(new_resolution, target_resolution)

	        # Forward the known layout instead of letting `pad` guess it from the shape.
	        return self.pad(image,
	                        padding=padding,
	                        input_data_format=input_data_format)

	    # Copied from transformers.models.llava_next.image_processing_llava_next.LlavaNextImageProcessor.get_image_patches
	    def get_image_patches(
	        self,
	        image: np.ndarray,
	        grid_pinpoints,
	        size: tuple,
	        patch_size: int,
	        resample: PILImageResampling,
	        data_format: ChannelDimension,
	        input_data_format: ChannelDimension,
	    ) -> list[np.ndarray]:
	        """
	        Turn one image into a resized overview image followed by its tiles.

	        Args:
	            image (`np.ndarray`):
	                The image to process.
	            grid_pinpoints (`list`):
	                Candidate resolutions to choose the best one from.
	            size (`tuple`):
	                Size the overview copy of the whole image is resized to.
	            patch_size (`int`):
	                Edge length of the tiles.
	            resample (`PILImageResampling`):
	                Resampling filter used when resizing.
	            data_format (`ChannelDimension`):
	                Channel layout of the returned arrays.
	            input_data_format (`ChannelDimension`):
	                Channel layout of `image`.

	        Returns:
	            `list[np.ndarray]`: The resized whole image first, then the tiles.

	        Raises:
	            TypeError: If `grid_pinpoints` is not a list.
	        """
	        if not isinstance(grid_pinpoints, list):
	            raise TypeError(
	                "grid_pinpoints must be a list of possible resolutions.")

	        possible_resolutions = grid_pinpoints

	        image_size = get_image_size(image, channel_dim=input_data_format)
	        best_resolution = select_best_resolution(image_size,
	                                                 possible_resolutions)
	        resized_image = self._resize_for_patching(
	            image,
	            best_resolution,
	            resample=resample,
	            input_data_format=input_data_format)
	        padded_image = self._pad_for_patching(
	            resized_image,
	            best_resolution,
	            input_data_format=input_data_format)

	        patches = divide_to_patches(padded_image,
	                                    patch_size=patch_size,
	                                    input_data_format=input_data_format)

	        # convert every patch to the requested output data format
	        patches = [
	            to_channel_dimension_format(patch,
	                                        channel_dim=data_format,
	                                        input_channel_dim=input_data_format)
	            for patch in patches
	        ]

	        resized_original_image = resize(
	            image,
	            size=size,
	            resample=resample,
	            data_format=data_format,
	            input_data_format=input_data_format,
	        )

	        return [resized_original_image] + patches

	    # Copied from transformers.models.llava_next.image_processing_llava_next.LlavaNextImageProcessor._pad_for_batching
	    def _pad_for_batching(
	        self,
	        pixel_values: list[np.ndarray],
	        data_format: Optional[Union[str, ChannelDimension]] = None,
	        input_data_format: Optional[Union[str, ChannelDimension]] = None,
	    ):
	        """
	        Pad every array along its first axis up to the longest first axis.

	        Args:
	            pixel_values (`list[np.ndarray]`):
	                Arrays to pad; the padding tuple has four entries, so each
	                array is expected to have four axes.
	            data_format (`str` or `ChannelDimension`, *optional*):
	                Channel layout of the result; unchanged when `None`.
	            input_data_format (`str` or `ChannelDimension`, *optional*):
	                Channel layout of the inputs; inferred when `None`.

	        Returns:
	            `list[np.ndarray]`: The padded arrays.
	        """
	        max_patch = max(len(x) for x in pixel_values)
	        return [
	            self.pad(
	                image,
	                # only the leading axis grows; the other three are untouched
	                padding=((0, max_patch - image.shape[0]), (0, 0), (0, 0),
	                         (0, 0)),
	                data_format=data_format,
	                input_data_format=input_data_format,
	            ) for image in pixel_values
	        ]

	    # Adapted from transformers.models.llava.image_processing_llava.LlavaImageProcessor.pad_to_square
	    def pad_to_square(
	        self,
	        image: np.ndarray,
	        background_color: Union[int, tuple[int, int, int]] = 0,
	        data_format: Optional[Union[str, ChannelDimension]] = None,
	        input_data_format: Optional[Union[str, ChannelDimension]] = None,
	    ) -> np.ndarray:
	        """
	        Pad an image to a square whose side is the longer image edge.

	        The image is centered along its shorter side.

	        Args:
	            image (`np.ndarray`):
	                The image to pad.
	            background_color (`int` or `tuple[int, int, int]`, defaults to 0):
	                Fill color. An `int` is applied to the first channel only and
	                the remaining channels stay 0; a sequence must have one entry
	                per channel.
	            data_format (`str` or `ChannelDimension`, *optional*):
	                Channel layout of the result; unchanged when `None`.
	            input_data_format (`str` or `ChannelDimension`, *optional*):
	                Channel layout of `image`; inferred when `None`.

	        Returns:
	            `np.ndarray`: The square image, with the dtype of `image`.

	        Raises:
	            ValueError: If `background_color` is a sequence whose length
	                differs from the number of channels.
	        """
	        # Resolve the layout up front so the channel axis below is read from
	        # the right position even when the caller did not specify it.
	        if input_data_format is None:
	            input_data_format = infer_channel_dimension_format(image)

	        height, width = get_image_size(image, input_data_format)
	        channels_first = input_data_format == ChannelDimension.FIRST
	        num_channels = image.shape[0] if channels_first else image.shape[-1]

	        if height == width:
	            if data_format is not None:
	                image = to_channel_dimension_format(image, data_format,
	                                                    input_data_format)
	            return image

	        max_dim = max(height, width)

	        # Ensure background_color is the correct shape
	        if isinstance(background_color, int):
	            background_color = [background_color]
	        elif len(background_color) != num_channels:
	            raise ValueError(
	                f"background_color must have no more than {num_channels} elements to match the number of channels"
	            )

	        if channels_first:
	            result = np.zeros((num_channels, max_dim, max_dim),
	                              dtype=image.dtype)
	            for i, color in enumerate(background_color):
	                result[i, :, :] = color
	            if width > height:
	                start = (max_dim - height) // 2
	                result[:, start:start + height, :] = image
	            else:
	                start = (max_dim - width) // 2
	                result[:, :, start:start + width] = image
	        else:
	            result = np.zeros((max_dim, max_dim, num_channels),
	                              dtype=image.dtype)
	            for i, color in enumerate(background_color):
	                result[:, :, i] = color
	            if width > height:
	                start = (max_dim - height) // 2
	                result[start:start + height, :, :] = image
	            else:
	                start = (max_dim - width) // 2
	                result[:, start:start + width, :] = image

	        if data_format is not None:
	            result = to_channel_dimension_format(result, data_format,
	                                                 input_data_format)
	        return result

	    def _preprocess(
	        self,
	        images: ImageInput,
	        do_resize: Optional[bool] = None,
	        size: Optional[dict[str, int]] = None,
	        resample: PILImageResampling = None,
	        do_rescale: Optional[bool] = None,
	        rescale_factor: Optional[float] = None,
	        do_normalize: Optional[bool] = None,
	        image_mean: Optional[Union[float, list[float]]] = None,
	        image_std: Optional[Union[float, list[float]]] = None,
	        do_convert_rgb: Optional[bool] = None,
	        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
	        input_data_format: Optional[Union[str, ChannelDimension]] = None,
	    ) -> list[np.ndarray]:
	        """
	        Resize, rescale and normalize a list of images, then convert their
	        channel layout.

	        Args:
	            images (`ImageInput`):
	                The images to process; iterated one by one.
	            do_resize (`bool`, *optional*):
	                Whether to resize each image to `size`.
	            size (*optional*):
	                Target size passed to `resize`.
	            resample (`PILImageResampling`, *optional*):
	                Resampling filter passed to `resize`.
	            do_rescale (`bool`, *optional*):
	                Whether to rescale each image by `rescale_factor`.
	            rescale_factor (`float`, *optional*):
	                Factor passed to `self.rescale`.
	            do_normalize (`bool`, *optional*):
	                Whether to normalize each image.
	            image_mean (`float` or `list[float]`, *optional*):
	                Mean passed to `self.normalize`.
	            image_std (`float` or `list[float]`, *optional*):
	                Std passed to `self.normalize`.
	            do_convert_rgb (`bool`, *optional*):
	                Accepted for signature compatibility; not used in this method.
	            data_format (`ChannelDimension`, *optional*, defaults to `FIRST`):
	                Channel layout of the returned images.
	            input_data_format (`str` or `ChannelDimension`, *optional*):
	                Channel layout of the incoming images.

	        Returns:
	            `list[np.ndarray]`: The processed images.
	        """
	        if do_resize:
	            images = [
	                resize(image=image,
	                       size=size,
	                       resample=resample,
	                       input_data_format=input_data_format)
	                for image in images
	            ]

	        if do_rescale:
	            images = [
	                self.rescale(image=image,
	                             scale=rescale_factor,
	                             input_data_format=input_data_format)
	                for image in images
	            ]

	        if do_normalize:
	            images = [
	                self.normalize(image=image,
	                               mean=image_mean,
	                               std=image_std,
	                               input_data_format=input_data_format)
	                for image in images
	            ]

	        images = [
	            to_channel_dimension_format(image,
	                                        data_format,
	                                        input_channel_dim=input_data_format)
	            for image in images
	        ]

	        return images

	    def preprocess(
	        self,
	        images: ImageInput,
	        do_resize: Optional[bool] = None,
	        size: Optional[dict[str, int]] = None,
	        image_grid_pinpoints: Optional[list] = None,
	        resample: PILImageResampling = None,
	        do_rescale: Optional[bool] = None,
	        rescale_factor: Optional[float] = None,
	        do_normalize: Optional[bool] = None,
	        image_mean: Optional[Union[float, list[float]]] = None,
	        image_std: Optional[Union[float, list[float]]] = None,
	        do_pad: Optional[bool] = None,
	        do_convert_rgb: Optional[bool] = None,
	        return_tensors: Optional[Union[str, TensorType]] = None,
	        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
	        input_data_format: Optional[Union[str, ChannelDimension]] = None,
	    ):
	        """
	        Preprocess an image, a list of images, or a list of lists of images.

	        Every option left as `None` falls back to the value stored on the
	        processor.

	        Args:
	            images (`ImageInput`):
	                A single image, a flat list (each entry is treated as its own
	                single-image sample), or a list of lists (one inner list per
	                sample).
	            do_resize (`bool`, *optional*):
	                Whether to resize patches to `size`.
	            size (`dict[str, int]`, *optional*):
	                Target size, with either `height`/`width` or `shortest_edge`.
	            image_grid_pinpoints (`list`, *optional*):
	                Candidate resolutions used when patching.
	            resample (`PILImageResampling`, *optional*):
	                Resampling filter used when resizing.
	            do_rescale (`bool`, *optional*):
	                Whether to rescale pixel values by `rescale_factor`.
	            rescale_factor (`float`, *optional*):
	                Factor used when rescaling.
	            do_normalize (`bool`, *optional*):
	                Whether to normalize with `image_mean`/`image_std`.
	            image_mean (`float` or `list[float]`, *optional*):
	                Normalization mean.
	            image_std (`float` or `list[float]`, *optional*):
	                Normalization std.
	            do_pad (`bool`, *optional*):
	                Whether to pad the per-image stacks to the same length.
	            do_convert_rgb (`bool`, *optional*):
	                Whether to convert images to RGB first.
	            return_tensors (`str` or `TensorType`, *optional*):
	                Tensor type passed to `BatchFeature`.
	            data_format (`ChannelDimension`, *optional*, defaults to `FIRST`):
	                Channel layout of the output images.
	            input_data_format (`str` or `ChannelDimension`, *optional*):
	                Channel layout of the inputs; inferred from the first image
	                when `None`.

	        Returns:
	            `BatchFeature` with the keys `pixel_values`, `image_sizes` and
	            `batch_num_images`.

	        Raises:
	            ValueError: If `images` contains an unsupported image type.
	        """
	        do_resize = do_resize if do_resize is not None else self.do_resize
	        size = size if size is not None else self.size
	        size = get_size_dict(size, default_to_square=False)
	        image_grid_pinpoints = image_grid_pinpoints if image_grid_pinpoints is not None else self.image_grid_pinpoints
	        resample = resample if resample is not None else self.resample
	        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
	        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
	        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
	        image_mean = image_mean if image_mean is not None else self.image_mean
	        image_std = image_std if image_std is not None else self.image_std
	        do_pad = do_pad if do_pad is not None else self.do_pad
	        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb

	        if isinstance(images,
	                      (tuple, list)) and isinstance(images[0], (tuple, list)):
	            # if the first element is a list, we assume that all elements are lists
	            batch_num_images = [len(x) for x in images]
	        elif isinstance(images, (tuple, list)):
	            # treat this as a single-image case for backward compatibility
	            batch_num_images = [1] * len(images)
	        else:
	            batch_num_images = [1]
	        # only single image patching is supported: one flag per flattened image,
	        # true when the sample it belongs to holds exactly one image
	        need_patching = [n == 1 for n in batch_num_images for _ in range(n)]

	        images = make_flat_list_of_images(images)

	        if not valid_images(images):
	            raise ValueError(
	                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
	                "torch.Tensor, tf.Tensor or jax.ndarray.")

	        validate_preprocess_arguments(
	            do_rescale=do_rescale,
	            rescale_factor=rescale_factor,
	            do_normalize=do_normalize,
	            image_mean=image_mean,
	            image_std=image_std,
	            do_resize=do_resize,
	            size=size,
	            resample=resample,
	        )

	        if do_convert_rgb:
	            images = [convert_to_rgb(image) for image in images]

	        # All transformations expect numpy arrays.
	        images = [to_numpy_array(image) for image in images]

	        if do_rescale and is_scaled_image(images[0]):
	            logger.warning_once(
	                "It looks like you are trying to rescale already rescaled images. If the input"
	                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
	            )

	        if input_data_format is None:
	            # We assume that all images have the same channel dimension format.
	            input_data_format = infer_channel_dimension_format(images[0])

	        size_tuple = ((size["height"], size["width"])
	                      if "height" in size and "width" in size else
	                      (size["shortest_edge"], size["shortest_edge"]))

	        new_images = []
	        # sizes of the inputs as passed in, before any patching or padding
	        image_sizes = [
	            get_image_size(image, channel_dim=input_data_format)
	            for image in images
	        ]
	        for i, image in enumerate(images):
	            if need_patching[i]:
	                # convert image into a list of patches
	                # we intentionally use the same data format as the input data format
	                image_patches = self.get_image_patches(
	                    image,
	                    image_grid_pinpoints,
	                    size=size_tuple,
	                    patch_size=size_tuple[0],
	                    resample=resample,
	                    data_format=input_data_format,
	                    input_data_format=input_data_format,
	                )
	            else:
	                # NOTE(review): the fill color is derived from `self.image_mean`,
	                # not from the per-call `image_mean` - confirm this is intended.
	                padded_image = self.pad_to_square(
	                    image=image,
	                    background_color=tuple(
	                        int(x * 255) for x in self.image_mean),
	                    input_data_format=input_data_format,
	                )
	                image_patches = [padded_image]

	            # preprocess patches
	            pixel_values = self._preprocess(
	                image_patches,
	                do_resize=do_resize,
	                size=size_tuple,
	                resample=resample,
	                do_rescale=do_rescale,
	                rescale_factor=rescale_factor,
	                do_normalize=do_normalize,
	                image_mean=image_mean,
	                image_std=image_std,
	                data_format=data_format,
	                input_data_format=input_data_format,
	            )
	            pixel_values = np.array(pixel_values)
	            new_images.append(pixel_values)

	        # Without padding, return the per-image stacks as they are; previously
	        # `processed_images` was left unbound in that case.
	        processed_images = (self._pad_for_batching(new_images)
	                            if do_pad else new_images)

	        return BatchFeature(
	            data={
	                "pixel_values": processed_images,
	                "image_sizes": image_sizes,
	                "batch_num_images": batch_num_images
	            },
	            tensor_type=return_tensors,
	        )


	# Names exported by a wildcard import of this module.
	__all__ = ["BeeImageProcessor"]