Spaces:

insomnia7
/

SAMTok

Running on Zero

App Files Files Community

SAMTok / sam2.py

insomnia7

Upload ./sam2.py with huggingface_hub

4235c77 verified about 1 month ago

raw

history blame contribute delete

172 kB

	import os
	import copy
	import math
	from functools import partial
	import torch
	import torch.nn as nn
	from torch import Tensor
	from torch.nn.utils.rnn import pad_sequence
	import torch.nn.functional as F
	import torch.distributed as dist
	from torch.nn.init import trunc_normal_

	from typing import Any, Callable, Optional, Union, Iterable, Tuple, Type, List
	from dataclasses import dataclass
	import numpy as np

	from transformers import PreTrainedModel, PretrainedConfig
	from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask
	from transformers.utils import can_return_tuple, ModelOutput
	from transformers.activations import ACT2FN


	class SAM2Config(PretrainedConfig):
	    """Configuration for the SAM2 backbone.

	    The only setting stored here is the name/path of the SAM2 checkpoint;
	    every other keyword argument is forwarded to ``PretrainedConfig``.
	    """

	    model_type = "sam2"
	    base_config_key = "sam2_config"

	    def __init__(
	        self,
	        # cfg_path: str = "sam2.1_hiera_l.yaml",
	        ckpt_path: str = "sam2.1_hiera_large.pt",
	        # hydra_overrides_extra = None,
	        # apply_postprocessing = True,
	        **kwargs
	    ):
	        """Store the checkpoint path.

	        Arguments:
	          ckpt_path (str): checkpoint file name or path, kept verbatim on
	            the instance as ``self.ckpt_path``.
	          **kwargs: passed through unchanged to ``PretrainedConfig``.
	        """
	        super().__init__(**kwargs)

	        # self.cfg_path = cfg_path
	        self.ckpt_path = ckpt_path

	# if hydra_overrides_extra is None:
	# hydra_overrides_extra = []
	# hydra_overrides = [
	# ## Extension: LLM prompt
	# "++model._target_=projects.transformers.vq_sam2.sam2_base.SAM2Base",
	# ]

	# if apply_postprocessing:
	# hydra_overrides_extra = hydra_overrides_extra.copy()
	# hydra_overrides_extra += [
	# # dynamically fall back to multi-mask if the single mask is not stable
	# # "++model.sam_mask_decoder_extra_args.dynamic_multimask_via_stability=true",
	# # "++model.sam_mask_decoder_extra_args.dynamic_multimask_stability_delta=0.05",
	# # "++model.sam_mask_decoder_extra_args.dynamic_multimask_stability_thresh=0.98",
	# # binarize the sigmoid mask logits on interacted frames with clicks in the memory encoder so that the encoded masks are exactly what users see from clicking
	# # "++model.binarize_mask_from_pts_for_mem_enc=true",
	# # fill small holes in the low-res masks up to `fill_hole_area` (before resizing them to the original video resolution)
	# # "++model.fill_hole_area=8",
	# ]
	# hydra_overrides.extend(hydra_overrides_extra)

	# # Read config and init model
	# cfg = compose(config_name=cfg_path, overrides=hydra_overrides)
	# OmegaConf.resolve(cfg)
	# self.cfg = cfg

	# def to_dict(self):
	# """Override to_dict to handle OmegaConf objects."""
	# output = super().to_dict()

	# # Handle the OmegaConf object stored in cfg
	# if hasattr(self, 'cfg') and self.cfg is not None:
	# if hasattr(self.cfg, '_content') and hasattr(self.cfg, 'to_container'):
	# output['cfg'] = OmegaConf.to_container(self.cfg, resolve=True)
	# else:
	# output['cfg'] = self.cfg

	# return output


	class VQ_SAM2Config(PretrainedConfig):
	    """Configuration for the vector-quantized SAM2 model.

	    Groups a nested :class:`SAM2Config` with the quantizer settings and the
	    loss settings. Every argument is stored verbatim on the instance under
	    the same name; how each value is interpreted is decided by the model
	    code that consumes this config.
	    """

	    model_type = "vq_sam2"
	    sub_configs = {
	        "sam2_config": SAM2Config,
	    }

	    def __init__(
	        self,
	        sam2_config: SAM2Config = None,
	        codebook_size: int = 1024,
	        codebook_depth: int = 4,
	        shared_codebook: bool = False,
	        latent_dim: int = 256,
	        # mask loss
	        loss_sample_points: bool = False,
	        num_points: int = 12544,
	        oversample_ratio: float = 3.0,
	        importance_sample_ratio: float = 0.75,
	        # vq loss
	        vq_loss_weight: float = 0.25,
	        **kwargs,
	    ):
	        """Record all settings on the config instance.

	        Arguments:
	          sam2_config (SAM2Config): nested SAM2 configuration; stored as
	            given (no default instance is created when it is ``None``).
	          codebook_size, codebook_depth, shared_codebook, latent_dim:
	            codebook / latent settings.
	          loss_sample_points, num_points, oversample_ratio,
	            importance_sample_ratio: mask-loss settings.
	          vq_loss_weight (float): VQ-loss setting.
	          **kwargs: passed through unchanged to ``PretrainedConfig``.
	        """
	        super().__init__(**kwargs)

	        # Nested backbone configuration.
	        self.sam2_config = sam2_config

	        # Codebook / latent settings.
	        self.codebook_size = codebook_size
	        self.codebook_depth = codebook_depth
	        self.shared_codebook = shared_codebook
	        self.latent_dim = latent_dim

	        # Mask-loss settings.
	        self.loss_sample_points = loss_sample_points
	        self.num_points = num_points
	        self.oversample_ratio = oversample_ratio
	        self.importance_sample_ratio = importance_sample_ratio

	        # VQ-loss settings.
	        self.vq_loss_weight = vq_loss_weight



	# def to_dict(self):
	# """Override to_dict to handle OmegaConf objects."""
	# output = super().to_dict()

	# # Handle the OmegaConf objects stored in sam2_config
	# if hasattr(self, 'sam2_config') and self.sam2_config is not None:
	# sam2_dict = {}
	# for key, value in self.sam2_config.__dict__.items():
	# if hasattr(value, '_content') and hasattr(value, 'to_container'):
	# # This is an OmegaConf object
	# sam2_dict[key] = OmegaConf.to_container(value, resolve=True)
	# else:
	# sam2_dict[key] = value
	# output['sam2_config'] = sam2_dict

	# return output


	# Lightly adapted from
	# https://github.com/facebookresearch/MaskFormer/blob/main/mask_former/modeling/transformer/transformer_predictor.py # noqa
	class MLP(nn.Module):
	    """A stack of ``num_layers`` linear layers.

	    The activation is applied after every layer except the last one; an
	    optional sigmoid can be applied to the final output.
	    """

	    def __init__(
	        self,
	        input_dim: int,
	        hidden_dim: int,
	        output_dim: int,
	        num_layers: int,
	        activation: nn.Module = nn.ReLU,
	        sigmoid_output: bool = False,
	    ) -> None:
	        """
	        Arguments:
	          input_dim (int): feature size of the input
	          hidden_dim (int): feature size of every intermediate layer
	          output_dim (int): feature size of the output
	          num_layers (int): total number of linear layers
	          activation: module class instantiated once (no arguments) and
	            shared by all hidden layers
	          sigmoid_output (bool): apply a sigmoid to the final output
	        """
	        super().__init__()
	        self.num_layers = num_layers
	        hidden_sizes = [hidden_dim] * (num_layers - 1)
	        fan_ins = [input_dim] + hidden_sizes
	        fan_outs = hidden_sizes + [output_dim]
	        self.layers = nn.ModuleList(
	            nn.Linear(fan_in, fan_out) for fan_in, fan_out in zip(fan_ins, fan_outs)
	        )
	        self.sigmoid_output = sigmoid_output
	        self.act = activation()

	    def forward(self, x):
	        """Run ``x`` through the layers and return the result."""
	        last_idx = self.num_layers - 1
	        for idx, linear in enumerate(self.layers):
	            x = linear(x)
	            if idx < last_idx:
	                x = self.act(x)
	        if self.sigmoid_output:
	            x = F.sigmoid(x)
	        return x

	# From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa
	# Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa
	class LayerNorm2d(nn.Module):
	    """Layer normalization over dimension 1 with a per-channel affine.

	    Mean and (biased) variance are computed along dimension 1 of the input;
	    ``weight`` and ``bias`` are broadcast as ``[:, None, None]``.
	    """

	    def __init__(self, num_channels: int, eps: float = 1e-6) -> None:
	        """
	        Arguments:
	          num_channels (int): length of the learned scale and shift vectors
	          eps (float): value added to the variance before the square root
	        """
	        super().__init__()
	        self.weight = nn.Parameter(torch.ones(num_channels))
	        self.bias = nn.Parameter(torch.zeros(num_channels))
	        self.eps = eps

	    def forward(self, x: torch.Tensor) -> torch.Tensor:
	        """Normalize ``x`` along dimension 1, then scale and shift it."""
	        mean = x.mean(1, keepdim=True)
	        variance = (x - mean).pow(2).mean(1, keepdim=True)
	        normalized = (x - mean) / torch.sqrt(variance + self.eps)
	        scale = self.weight[:, None, None]
	        shift = self.bias[:, None, None]
	        return scale * normalized + shift


	class MaskDecoder(nn.Module):
	    """Predicts masks from image and prompt embeddings with a transformer.

	    Learned output tokens (an optional object-score token, one IoU token and
	    ``num_multimask_outputs + 1`` mask tokens) are prepended to the sparse
	    prompt embeddings and run through ``transformer`` together with the image
	    embedding. Each mask token is turned into per-channel weights by its own
	    hypernetwork MLP and applied to the upscaled image embedding.
	    """

	    def __init__(
	        self,
	        *,
	        transformer_dim: int,
	        transformer: nn.Module,
	        num_multimask_outputs: int = 3,
	        activation: Type[nn.Module] = nn.GELU,
	        iou_head_depth: int = 3,
	        iou_head_hidden_dim: int = 256,
	        use_high_res_features: bool = False,
	        iou_prediction_use_sigmoid=False,
	        dynamic_multimask_via_stability=False,
	        dynamic_multimask_stability_delta=0.05,
	        dynamic_multimask_stability_thresh=0.98,
	        pred_obj_scores: bool = False,
	        pred_obj_scores_mlp: bool = False,
	        use_multimask_token_for_obj_ptr: bool = False,
	    ) -> None:
	        """
	        Predicts masks given an image and prompt embeddings, using a
	        transformer architecture.

	        Arguments:
	          transformer_dim (int): the channel dimension of the transformer
	          transformer (nn.Module): the transformer used to predict masks
	          num_multimask_outputs (int): the number of masks to predict
	            when disambiguating masks
	          activation (nn.Module): the type of activation to use when
	            upscaling masks
	          iou_head_depth (int): the depth of the MLP used to predict
	            mask quality
	          iou_head_hidden_dim (int): the hidden dimension of the MLP
	            used to predict mask quality
	          use_high_res_features (bool): add 1x1 convolutions (``conv_s0``,
	            ``conv_s1``) and fuse two extra feature maps while upscaling
	          iou_prediction_use_sigmoid (bool): apply a sigmoid to the output
	            of the IoU head
	          dynamic_multimask_via_stability (bool): in eval mode, replace an
	            unstable single-mask output with the best multimask output
	          dynamic_multimask_stability_delta (float): logit margin used by
	            the stability score
	          dynamic_multimask_stability_thresh (float): minimum stability
	            score for the single-mask output to be kept
	          pred_obj_scores (bool): add an object-score token and head
	          pred_obj_scores_mlp (bool): use a 3-layer MLP instead of a single
	            linear layer as the object-score head
	          use_multimask_token_for_obj_ptr (bool): return the multimask
	            tokens (instead of the single-mask token) when
	            ``multimask_output`` is set
	        """
	        super().__init__()
	        quarter_dim = transformer_dim // 4
	        eighth_dim = transformer_dim // 8

	        self.transformer_dim = transformer_dim
	        self.transformer = transformer
	        self.num_multimask_outputs = num_multimask_outputs

	        # Learned output tokens.
	        self.iou_token = nn.Embedding(1, transformer_dim)
	        self.num_mask_tokens = num_multimask_outputs + 1
	        self.mask_tokens = nn.Embedding(self.num_mask_tokens, transformer_dim)

	        self.pred_obj_scores = pred_obj_scores
	        if pred_obj_scores:
	            self.obj_score_token = nn.Embedding(1, transformer_dim)
	        self.use_multimask_token_for_obj_ptr = use_multimask_token_for_obj_ptr

	        # Two stride-2 transposed convolutions: 4x spatial upscaling.
	        upscaling_stages = [
	            nn.ConvTranspose2d(transformer_dim, quarter_dim, kernel_size=2, stride=2),
	            LayerNorm2d(quarter_dim),
	            activation(),
	            nn.ConvTranspose2d(quarter_dim, eighth_dim, kernel_size=2, stride=2),
	            activation(),
	        ]
	        self.output_upscaling = nn.Sequential(*upscaling_stages)

	        self.use_high_res_features = use_high_res_features
	        if use_high_res_features:
	            self.conv_s0 = nn.Conv2d(
	                transformer_dim, eighth_dim, kernel_size=1, stride=1
	            )
	            self.conv_s1 = nn.Conv2d(
	                transformer_dim, quarter_dim, kernel_size=1, stride=1
	            )

	        # One hypernetwork MLP per mask token.
	        self.output_hypernetworks_mlps = nn.ModuleList(
	            [
	                MLP(transformer_dim, transformer_dim, eighth_dim, 3)
	                for _ in range(self.num_mask_tokens)
	            ]
	        )

	        self.iou_prediction_head = MLP(
	            transformer_dim,
	            iou_head_hidden_dim,
	            self.num_mask_tokens,
	            iou_head_depth,
	            sigmoid_output=iou_prediction_use_sigmoid,
	        )
	        if pred_obj_scores:
	            self.pred_obj_score_head = nn.Linear(transformer_dim, 1)
	            if pred_obj_scores_mlp:
	                self.pred_obj_score_head = MLP(transformer_dim, transformer_dim, 1, 3)

	        # When outputting a single mask, optionally we can dynamically fall back to the best
	        # multimask output token if the single mask output token gives low stability scores.
	        self.dynamic_multimask_via_stability = dynamic_multimask_via_stability
	        self.dynamic_multimask_stability_delta = dynamic_multimask_stability_delta
	        self.dynamic_multimask_stability_thresh = dynamic_multimask_stability_thresh

	    def forward(
	        self,
	        image_embeddings: torch.Tensor,
	        image_pe: torch.Tensor,
	        sparse_prompt_embeddings: torch.Tensor,
	        dense_prompt_embeddings: torch.Tensor,
	        multimask_output: bool,
	        repeat_image: bool,
	        high_res_features: Optional[List[torch.Tensor]] = None,
	    ) -> Tuple[torch.Tensor, torch.Tensor]:
	        """
	        Predict masks given image and prompt embeddings.

	        Arguments:
	          image_embeddings (torch.Tensor): the embeddings from the image encoder
	          image_pe (torch.Tensor): positional encoding with the shape of image_embeddings
	          sparse_prompt_embeddings (torch.Tensor): the embeddings of the points and boxes
	          dense_prompt_embeddings (torch.Tensor): the embeddings of the mask inputs
	          multimask_output (bool): Whether to return multiple masks or a single
	            mask.
	          repeat_image (bool): repeat the image embeddings once per prompt
	            instead of requiring matching batch sizes
	          high_res_features (list of torch.Tensor or None): the two extra
	            feature maps used when ``use_high_res_features`` is set

	        Returns:
	          torch.Tensor: batched predicted masks
	          torch.Tensor: batched predictions of mask quality
	          torch.Tensor: batched SAM token for mask output
	          torch.Tensor: object score logits
	        """
	        all_masks, all_ious, mask_tokens_out, object_score_logits = self.predict_masks(
	            image_embeddings=image_embeddings,
	            image_pe=image_pe,
	            sparse_prompt_embeddings=sparse_prompt_embeddings,
	            dense_prompt_embeddings=dense_prompt_embeddings,
	            repeat_image=repeat_image,
	            high_res_features=high_res_features,
	        )

	        # Select the correct mask or masks for output
	        if multimask_output:
	            masks = all_masks[:, 1:, :, :]
	            iou_pred = all_ious[:, 1:]
	        elif self.dynamic_multimask_via_stability and not self.training:
	            masks, iou_pred = self._dynamic_multimask_via_stability(all_masks, all_ious)
	        else:
	            masks = all_masks[:, 0:1, :, :]
	            iou_pred = all_ious[:, 0:1]

	        # By default take the single-mask output token. At test time, even if we track
	        # after 1-click (and using multimask_output=True), we still take the single mask
	        # token. The rationale is that we always track after multiple clicks during
	        # training, so the past tokens seen during training are always the single mask
	        # token (and we'll let it be the object-memory token).
	        sam_tokens_out = mask_tokens_out[:, 0:1]  # [b, 1, c] shape
	        if multimask_output and self.use_multimask_token_for_obj_ptr:
	            sam_tokens_out = mask_tokens_out[:, 1:]  # [b, 3, c] shape

	        return masks, iou_pred, sam_tokens_out, object_score_logits

	    def predict_masks(
	        self,
	        image_embeddings: torch.Tensor,
	        image_pe: torch.Tensor,
	        sparse_prompt_embeddings: torch.Tensor,
	        dense_prompt_embeddings: torch.Tensor,
	        repeat_image: bool,
	        high_res_features: Optional[List[torch.Tensor]] = None,
	    ) -> Tuple[torch.Tensor, torch.Tensor]:
	        """Predicts masks. See 'forward' for more details.

	        Returns the masks for all mask tokens, the IoU prediction for all
	        mask tokens, the transformer outputs at the mask-token positions and
	        the object score logits.
	        """
	        # Learned tokens go first: [obj_score]?, iou, mask_0..mask_k.
	        learned_tokens = [self.iou_token.weight, self.mask_tokens.weight]
	        s = 0  # index of the IoU token in the transformer output
	        if self.pred_obj_scores:
	            learned_tokens.insert(0, self.obj_score_token.weight)
	            s = 1
	        num_prompts = sparse_prompt_embeddings.size(0)
	        output_tokens = (
	            torch.cat(learned_tokens, dim=0)
	            .unsqueeze(0)
	            .expand(num_prompts, -1, -1)
	            .contiguous()
	        )
	        tokens = torch.cat((output_tokens, sparse_prompt_embeddings), dim=1)

	        # Expand per-image data in batch direction to be per-mask
	        if repeat_image:
	            src = torch.repeat_interleave(image_embeddings, tokens.shape[0], dim=0)
	        else:
	            assert image_embeddings.shape[0] == tokens.shape[0]
	            src = image_embeddings
	        src = src + dense_prompt_embeddings
	        assert (
	            image_pe.size(0) == 1
	        ), "image_pe should have size 1 in batch dim (from `get_dense_pe()`)"
	        pos_src = torch.repeat_interleave(image_pe, tokens.shape[0], dim=0)
	        b, c, h, w = src.shape

	        # Run the transformer
	        hs, src = self.transformer(src, pos_src, tokens)
	        iou_token_out = hs[:, s, :]
	        first_mask = s + 1
	        mask_tokens_out = hs[:, first_mask : (first_mask + self.num_mask_tokens), :]

	        # Upscale mask embeddings and predict masks using the mask tokens
	        src = src.transpose(1, 2).view(b, c, h, w).contiguous()
	        if self.use_high_res_features:
	            dc1, ln1, act1, dc2, act2 = self.output_upscaling
	            feat_s0, feat_s1 = high_res_features
	            upscaled_embedding = act1(ln1(dc1(src) + feat_s1))
	            upscaled_embedding = act2(dc2(upscaled_embedding) + feat_s0)
	        else:
	            upscaled_embedding = self.output_upscaling(src)

	        # One weight vector per mask token, applied to the upscaled features.
	        hyper_in = torch.stack(
	            [
	                self.output_hypernetworks_mlps[i](mask_tokens_out[:, i, :])
	                for i in range(self.num_mask_tokens)
	            ],
	            dim=1,
	        )
	        b, c, h, w = upscaled_embedding.shape
	        flat_embedding = upscaled_embedding.view(b, c, h * w)
	        masks = (hyper_in @ flat_embedding).view(b, -1, h, w).contiguous()

	        # Generate mask quality predictions
	        iou_pred = self.iou_prediction_head(iou_token_out)
	        if self.pred_obj_scores:
	            assert s == 1
	            object_score_logits = self.pred_obj_score_head(hs[:, 0, :])
	        else:
	            # Obj scores logits - default to 10.0, i.e. assuming the object is present, sigmoid(10)=1
	            object_score_logits = 10.0 * iou_pred.new_ones(iou_pred.shape[0], 1)

	        return masks, iou_pred, mask_tokens_out, object_score_logits

	    def _get_stability_scores(self, mask_logits):
	        """
	        Compute stability scores of the mask logits based on the IoU between upper and
	        lower thresholds: the number of logits above ``+delta`` divided by the number
	        above ``-delta`` (1.0 when the latter is zero).
	        """
	        delta = self.dynamic_multimask_stability_delta
	        flat_logits = mask_logits.flatten(-2)
	        area_i = torch.sum(flat_logits > delta, dim=-1).float()
	        area_u = torch.sum(flat_logits > -delta, dim=-1).float()
	        return torch.where(area_u > 0, area_i / area_u, 1.0)

	    def _dynamic_multimask_via_stability(self, all_mask_logits, all_iou_scores):
	        """
	        When outputting a single mask, if the stability score from the current single-mask
	        output (based on output token 0) falls below a threshold, we instead select from
	        multi-mask outputs (based on output token 1~3) the mask with the highest predicted
	        IoU score. This is intended to ensure a valid mask for both clicking and tracking.
	        """
	        # The mask from singlemask output token 0 and its stability score
	        singlemask_logits = all_mask_logits[:, 0:1, :, :]
	        singlemask_iou_scores = all_iou_scores[:, 0:1]

	        # The best mask from multimask output tokens (1~3)
	        multimask_logits = all_mask_logits[:, 1:, :, :]
	        multimask_iou_scores = all_iou_scores[:, 1:]
	        best_idx = torch.argmax(multimask_iou_scores, dim=-1)
	        rows = torch.arange(
	            multimask_iou_scores.size(0), device=all_iou_scores.device
	        )
	        best_multimask_logits = multimask_logits[rows, best_idx].unsqueeze(1)
	        best_multimask_iou_scores = multimask_iou_scores[rows, best_idx].unsqueeze(1)

	        stability_scores = self._get_stability_scores(singlemask_logits)
	        is_stable = stability_scores >= self.dynamic_multimask_stability_thresh

	        # Dynamically fall back to best multimask output upon low stability scores.
	        keep_single_mask = is_stable[..., None, None].expand_as(singlemask_logits)
	        mask_logits_out = torch.where(
	            keep_single_mask, singlemask_logits, best_multimask_logits
	        )
	        keep_single_iou = is_stable.expand_as(singlemask_iou_scores)
	        iou_scores_out = torch.where(
	            keep_single_iou, singlemask_iou_scores, best_multimask_iou_scores
	        )
	        return mask_logits_out, iou_scores_out

	class PositionEmbeddingRandom(nn.Module):
	    """
	    Positional encoding using random spatial frequencies.

	    A fixed ``2 x num_pos_feats`` Gaussian matrix (registered as a buffer)
	    projects 2-D coordinates; the encoding is the sine and cosine of that
	    projection, giving ``2 * num_pos_feats`` channels.
	    """

	    def __init__(self, num_pos_feats: int = 64, scale: Optional[float] = None) -> None:
	        """
	        Arguments:
	          num_pos_feats (int): number of random frequencies
	          scale (float or None): multiplier for the Gaussian matrix; ``None``
	            or a non-positive value falls back to 1.0
	        """
	        super().__init__()
	        use_default_scale = scale is None or scale <= 0.0
	        if use_default_scale:
	            scale = 1.0
	        gaussian_matrix = scale * torch.randn((2, num_pos_feats))
	        self.register_buffer("positional_encoding_gaussian_matrix", gaussian_matrix)

	    def _pe_encoding(self, coords: torch.Tensor) -> torch.Tensor:
	        """Positionally encode points that are normalized to [0,1]."""
	        # Input is d_1 x ... x d_n x 2 in the [0, 1]^2 square; map it to [-1, 1].
	        centered = 2 * coords - 1
	        projected = centered @ self.positional_encoding_gaussian_matrix
	        angles = 2 * np.pi * projected
	        # Output is d_1 x ... x d_n x C.
	        return torch.cat([torch.sin(angles), torch.cos(angles)], dim=-1)

	    def forward(self, size: Tuple[int, int]) -> torch.Tensor:
	        """Generate positional encoding for a grid of the specified size."""
	        h, w = size
	        device: Any = self.positional_encoding_gaussian_matrix.device
	        ones = torch.ones((h, w), device=device, dtype=torch.float32)
	        # Pixel-centre coordinates, normalized by the grid size.
	        y_embed = (ones.cumsum(dim=0) - 0.5) / h
	        x_embed = (ones.cumsum(dim=1) - 0.5) / w
	        xy = torch.stack([x_embed, y_embed], dim=-1)
	        return self._pe_encoding(xy).permute(2, 0, 1)  # C x H x W

	    def forward_with_coords(
	        self, coords_input: torch.Tensor, image_size: Tuple[int, int]
	    ) -> torch.Tensor:
	        """Positionally encode points that are not normalized to [0,1]."""
	        img_h, img_w = image_size[0], image_size[1]
	        # Work on a copy so the caller's tensor is left untouched.
	        coords = coords_input.clone()
	        coords[:, :, 0] = coords[:, :, 0] / img_w
	        coords[:, :, 1] = coords[:, :, 1] / img_h
	        return self._pe_encoding(coords.to(torch.float))  # B x N x C

	class PromptEncoder(nn.Module):
	    """Encodes point, box and mask prompts for SAM's mask decoder."""

	    def __init__(
	        self,
	        embed_dim: int,
	        image_embedding_size: Tuple[int, int],
	        input_image_size: Tuple[int, int],
	        mask_in_chans: int,
	        activation: Type[nn.Module] = nn.GELU,
	    ) -> None:
	        """
	        Encodes prompts for input to SAM's mask decoder.

	        Arguments:
	          embed_dim (int): The prompts' embedding dimension
	          image_embedding_size (tuple(int, int)): The spatial size of the
	            image embedding, as (H, W).
	          input_image_size (int): The padded size of the image as input
	            to the image encoder, as (H, W).
	          mask_in_chans (int): The number of hidden channels used for
	            encoding input masks.
	          activation (nn.Module): The activation to use when encoding
	            input masks.
	        """
	        super().__init__()
	        self.embed_dim = embed_dim
	        self.input_image_size = input_image_size
	        self.image_embedding_size = image_embedding_size
	        self.pe_layer = PositionEmbeddingRandom(embed_dim // 2)

	        # pos/neg point + 2 box corners
	        self.num_point_embeddings: int = 4
	        self.point_embeddings = nn.ModuleList(
	            [nn.Embedding(1, embed_dim) for _ in range(self.num_point_embeddings)]
	        )
	        self.not_a_point_embed = nn.Embedding(1, embed_dim)

	        # Input masks are expected at 4x the embedding resolution; the two
	        # stride-2 convolutions below bring them back down by 4x.
	        self.mask_input_size = (
	            4 * image_embedding_size[0],
	            4 * image_embedding_size[1],
	        )
	        reduced_chans = mask_in_chans // 4
	        self.mask_downscaling = nn.Sequential(
	            nn.Conv2d(1, reduced_chans, kernel_size=2, stride=2),
	            LayerNorm2d(reduced_chans),
	            activation(),
	            nn.Conv2d(reduced_chans, mask_in_chans, kernel_size=2, stride=2),
	            LayerNorm2d(mask_in_chans),
	            activation(),
	            nn.Conv2d(mask_in_chans, embed_dim, kernel_size=1),
	        )
	        self.no_mask_embed = nn.Embedding(1, embed_dim)

	    def get_dense_pe(self) -> torch.Tensor:
	        """
	        Returns the positional encoding used to encode point prompts,
	        applied to a dense set of points the shape of the image encoding.

	        Returns:
	          torch.Tensor: Positional encoding with shape
	            1x(embed_dim)x(embedding_h)x(embedding_w)
	        """
	        dense_pe = self.pe_layer(self.image_embedding_size)
	        return dense_pe.unsqueeze(0)

	    def _embed_points(
	        self,
	        points: torch.Tensor,
	        labels: torch.Tensor,
	        pad: bool,
	    ) -> torch.Tensor:
	        """Embeds point prompts.

	        Label -1 marks "not a point" (its encoding is replaced by a learned
	        vector); labels 0..3 each add the matching learned point embedding
	        to the positional encoding. With ``pad`` set, one extra (0, 0) point
	        with label -1 is appended to every row.
	        """
	        points = points + 0.5  # Shift to center of pixel
	        if pad:
	            extra_point = torch.zeros((points.shape[0], 1, 2), device=points.device)
	            extra_label = -torch.ones((labels.shape[0], 1), device=labels.device)
	            points = torch.cat([points, extra_point], dim=1)
	            labels = torch.cat([labels, extra_label], dim=1)
	        point_embedding = self.pe_layer.forward_with_coords(
	            points, self.input_image_size
	        )

	        # Replace the encoding of "not a point" entries.
	        point_embedding = torch.where(
	            (labels == -1).unsqueeze(-1),
	            torch.zeros_like(point_embedding) + self.not_a_point_embed.weight,
	            point_embedding,
	        )
	        # Add the learned embedding that matches each label id (0..3).
	        for label_id in range(self.num_point_embeddings):
	            point_embedding = torch.where(
	                (labels == label_id).unsqueeze(-1),
	                point_embedding + self.point_embeddings[label_id].weight,
	                point_embedding,
	            )
	        return point_embedding

	    def _embed_boxes(self, boxes: torch.Tensor) -> torch.Tensor:
	        """Embeds box prompts."""
	        shifted = boxes + 0.5  # Shift to center of pixel
	        corners = shifted.reshape(-1, 2, 2).contiguous()
	        corner_embedding = self.pe_layer.forward_with_coords(
	            corners, self.input_image_size
	        )
	        # First corner uses point embedding 2, second corner point embedding 3.
	        corner_embedding[:, 0, :] += self.point_embeddings[2].weight
	        corner_embedding[:, 1, :] += self.point_embeddings[3].weight
	        return corner_embedding

	    def _embed_masks(self, masks: torch.Tensor) -> torch.Tensor:
	        """Embeds mask inputs."""
	        return self.mask_downscaling(masks)

	    def _get_batch_size(
	        self,
	        points: Optional[Tuple[torch.Tensor, torch.Tensor]],
	        boxes: Optional[torch.Tensor],
	        masks: Optional[torch.Tensor],
	    ) -> int:
	        """
	        Gets the batch size of the output given the batch size of the input prompts.

	        The first prompt that is not ``None`` (points, then boxes, then masks)
	        decides; with no prompt at all the batch size is 1.
	        """
	        if points is not None:
	            return points[0].shape[0]
	        if boxes is not None:
	            return boxes.shape[0]
	        if masks is not None:
	            return masks.shape[0]
	        return 1

	    def _get_device(self) -> torch.device:
	        """Return the device holding the first point embedding."""
	        return self.point_embeddings[0].weight.device

	    def forward(
	        self,
	        points: Optional[Tuple[torch.Tensor, torch.Tensor]],
	        boxes: Optional[torch.Tensor],
	        masks: Optional[torch.Tensor],
	    ) -> Tuple[torch.Tensor, torch.Tensor]:
	        """
	        Embeds different types of prompts, returning both sparse and dense
	        embeddings.

	        Arguments:
	          points (tuple(torch.Tensor, torch.Tensor) or none): point coordinates
	            and labels to embed.
	          boxes (torch.Tensor or none): boxes to embed
	          masks (torch.Tensor or none): masks to embed

	        Returns:
	          torch.Tensor: sparse embeddings for the points and boxes, with shape
	            BxNx(embed_dim), where N is determined by the number of input points
	            and boxes.
	          torch.Tensor: dense embeddings for the masks, in the shape
	            Bx(embed_dim)x(embed_H)x(embed_W)
	        """
	        bs = self._get_batch_size(points, boxes, masks)

	        # Start from an empty token sequence and append whatever is given.
	        sparse_embeddings = torch.empty(
	            (bs, 0, self.embed_dim), device=self._get_device()
	        )
	        if points is not None:
	            coords, labels = points
	            # Pad with a "not a point" entry only when no box is supplied.
	            embedded_points = self._embed_points(coords, labels, pad=(boxes is None))
	            sparse_embeddings = torch.cat([sparse_embeddings, embedded_points], dim=1)
	        if boxes is not None:
	            embedded_boxes = self._embed_boxes(boxes)
	            sparse_embeddings = torch.cat([sparse_embeddings, embedded_boxes], dim=1)

	        if masks is None:
	            # No mask prompt: broadcast the learned "no mask" vector over the grid.
	            emb_h, emb_w = self.image_embedding_size[0], self.image_embedding_size[1]
	            dense_embeddings = self.no_mask_embed.weight.reshape(1, -1, 1, 1).expand(
	                bs, -1, emb_h, emb_w
	            ).contiguous()
	        else:
	            dense_embeddings = self._embed_masks(masks)

	        return sparse_embeddings, dense_embeddings


	class TwoWayTransformer(nn.Module):
	    """Stack of two-way attention blocks followed by a final token-to-image attention."""

	    def __init__(
	        self,
	        depth: int,
	        embedding_dim: int,
	        num_heads: int,
	        mlp_dim: int,
	        activation: Type[nn.Module] = nn.ReLU,
	        attention_downsample_rate: int = 2,
	    ) -> None:
	        """
	        A transformer decoder that attends to an input image using
	        queries whose positional embedding is supplied.

	        Args:
	          depth (int): number of layers in the transformer
	          embedding_dim (int): the channel dimension for the input embeddings
	          num_heads (int): the number of heads for multihead attention. Must
	            divide embedding_dim
	          mlp_dim (int): the channel dimension internal to the MLP block
	          activation (nn.Module): the activation to use in the MLP block
	          attention_downsample_rate (int): downsample rate passed to the
	            cross-attention layers
	        """
	        super().__init__()
	        self.depth = depth
	        self.embedding_dim = embedding_dim
	        self.num_heads = num_heads
	        self.mlp_dim = mlp_dim

	        # Only the first block skips the positional encoding in its
	        # self-attention step.
	        self.layers = nn.ModuleList(
	            TwoWayAttentionBlock(
	                embedding_dim=embedding_dim,
	                num_heads=num_heads,
	                mlp_dim=mlp_dim,
	                activation=activation,
	                attention_downsample_rate=attention_downsample_rate,
	                skip_first_layer_pe=(block_idx == 0),
	            )
	            for block_idx in range(depth)
	        )

	        self.final_attn_token_to_image = Attention(
	            embedding_dim, num_heads, downsample_rate=attention_downsample_rate
	        )
	        self.norm_final_attn = nn.LayerNorm(embedding_dim)

	    def forward(
	        self,
	        image_embedding: Tensor,
	        image_pe: Tensor,
	        point_embedding: Tensor,
	    ) -> Tuple[Tensor, Tensor]:
	        """
	        Args:
	          image_embedding (torch.Tensor): image to attend to. Should be shape
	            B x embedding_dim x h x w for any h and w.
	          image_pe (torch.Tensor): the positional encoding to add to the image. Must
	            have the same shape as image_embedding.
	          point_embedding (torch.Tensor): the embedding to add to the query points.
	            Must have shape B x N_points x embedding_dim for any N_points.

	        Returns:
	          torch.Tensor: the processed point_embedding
	          torch.Tensor: the processed image_embedding
	        """
	        # Unpacking also enforces that the image embedding is 4-D.
	        bs, c, h, w = image_embedding.shape

	        # BxCxHxW -> BxHWxC == B x N_image_tokens x C
	        keys = image_embedding.flatten(2).permute(0, 2, 1).contiguous()
	        image_pe = image_pe.flatten(2).permute(0, 2, 1).contiguous()
	        queries = point_embedding

	        # The original point embedding doubles as the query positional encoding.
	        for block in self.layers:
	            queries, keys = block(
	                queries=queries,
	                keys=keys,
	                query_pe=point_embedding,
	                key_pe=image_pe,
	            )

	        # Apply the final attention layer from the points to the image
	        attn_out = self.final_attn_token_to_image(
	            q=queries + point_embedding,
	            k=keys + image_pe,
	            v=keys,
	        )
	        queries = self.norm_final_attn(queries + attn_out)

	        return queries, keys


	class TwoWayAttentionBlock(nn.Module):
	    """One two-way block: token self-attention, token-to-image attention, MLP, image-to-token attention."""

	    def __init__(
	        self,
	        embedding_dim: int,
	        num_heads: int,
	        mlp_dim: int = 2048,
	        activation: Type[nn.Module] = nn.ReLU,
	        attention_downsample_rate: int = 2,
	        skip_first_layer_pe: bool = False,
	    ) -> None:
	        """
	        A transformer block with four layers: (1) self-attention of sparse
	        inputs, (2) cross attention of sparse inputs to dense inputs, (3) mlp
	        block on sparse inputs, and (4) cross attention of dense inputs to sparse
	        inputs.

	        Arguments:
	          embedding_dim (int): the channel dimension of the embeddings
	          num_heads (int): the number of heads in the attention layers
	          mlp_dim (int): the hidden dimension of the mlp block
	          activation (nn.Module): the activation of the mlp block
	          attention_downsample_rate (int): downsample rate of the two
	            cross-attention layers
	          skip_first_layer_pe (bool): skip the PE on the first layer
	        """
	        super().__init__()

	        # (1) self-attention over the tokens
	        self.self_attn = Attention(embedding_dim, num_heads)
	        self.norm1 = nn.LayerNorm(embedding_dim)

	        # (2) tokens attend to the image
	        self.cross_attn_token_to_image = Attention(
	            embedding_dim, num_heads, downsample_rate=attention_downsample_rate
	        )
	        self.norm2 = nn.LayerNorm(embedding_dim)

	        # (3) token MLP
	        self.mlp = MLP(
	            embedding_dim, mlp_dim, embedding_dim, num_layers=2, activation=activation
	        )
	        self.norm3 = nn.LayerNorm(embedding_dim)

	        # (4) image attends to the tokens
	        self.norm4 = nn.LayerNorm(embedding_dim)
	        self.cross_attn_image_to_token = Attention(
	            embedding_dim, num_heads, downsample_rate=attention_downsample_rate
	        )

	        self.skip_first_layer_pe = skip_first_layer_pe

	    def forward(
	        self, queries: Tensor, keys: Tensor, query_pe: Tensor, key_pe: Tensor
	    ) -> Tuple[Tensor, Tensor]:
	        """Run the four sub-layers and return the updated ``(queries, keys)``."""
	        # Self attention block. With skip_first_layer_pe the attention output
	        # replaces the queries (no positional encoding, no residual).
	        if self.skip_first_layer_pe:
	            queries = self.self_attn(q=queries, k=queries, v=queries)
	        else:
	            q_with_pe = queries + query_pe
	            queries = queries + self.self_attn(q=q_with_pe, k=q_with_pe, v=queries)
	        queries = self.norm1(queries)

	        # Cross attention block, tokens attending to image embedding
	        token_side = queries + query_pe
	        image_side = keys + key_pe
	        token_update = self.cross_attn_token_to_image(q=token_side, k=image_side, v=keys)
	        queries = self.norm2(queries + token_update)

	        # MLP block
	        queries = self.norm3(queries + self.mlp(queries))

	        # Cross attention block, image embedding attending to tokens
	        token_side = queries + query_pe
	        image_side = keys + key_pe
	        image_update = self.cross_attn_image_to_token(q=image_side, k=token_side, v=queries)
	        keys = self.norm4(keys + image_update)

	        return queries, keys


	class Attention(nn.Module):
	    """
	    An attention layer that allows for downscaling the size of the embedding
	    after projection to queries, keys, and values.
	    """

	    def __init__(
	        self,
	        embedding_dim: int,
	        num_heads: int,
	        downsample_rate: int = 1,
	        dropout: float = 0.0,
	        kv_in_dim: int = None,
	    ) -> None:
	        """
	        Arguments:
	          embedding_dim (int): channel size of the queries and of the output
	          num_heads (int): number of attention heads; must divide
	            ``embedding_dim // downsample_rate``
	          downsample_rate (int): factor by which the projections shrink the
	            channel size
	          dropout (float): attention dropout probability, used in training
	            mode only
	          kv_in_dim (int or None): channel size of the keys and values;
	            defaults to ``embedding_dim``
	        """
	        super().__init__()
	        self.embedding_dim = embedding_dim
	        self.kv_in_dim = embedding_dim if kv_in_dim is None else kv_in_dim
	        self.internal_dim = embedding_dim // downsample_rate
	        self.num_heads = num_heads
	        assert (
	            self.internal_dim % num_heads == 0
	        ), "num_heads must divide embedding_dim."

	        self.q_proj = nn.Linear(embedding_dim, self.internal_dim)
	        self.k_proj = nn.Linear(self.kv_in_dim, self.internal_dim)
	        self.v_proj = nn.Linear(self.kv_in_dim, self.internal_dim)
	        self.out_proj = nn.Linear(self.internal_dim, embedding_dim)

	        self.dropout_p = dropout

	    def _separate_heads(self, x: Tensor, num_heads: int) -> Tensor:
	        """Split the channel dimension into heads: B x N x C -> B x N_heads x N x C_per_head."""
	        batch, tokens, channels = x.shape
	        per_head = channels // num_heads
	        split = x.reshape(batch, tokens, num_heads, per_head).contiguous()
	        return split.transpose(1, 2).contiguous()

	    def _recombine_heads(self, x: Tensor) -> Tensor:
	        """Inverse of ``_separate_heads``: B x N_heads x N x C_per_head -> B x N x C."""
	        batch, n_heads, n_tokens, per_head = x.shape
	        merged = x.transpose(1, 2).contiguous()
	        return merged.reshape(batch, n_tokens, n_heads * per_head).contiguous()

	    def forward(self, q: Tensor, k: Tensor, v: Tensor) -> Tensor:
	        """Project the inputs, run multi-head scaled dot-product attention, project back."""
	        # Project, then split into heads.
	        q_heads = self._separate_heads(self.q_proj(q), self.num_heads)
	        k_heads = self._separate_heads(self.k_proj(k), self.num_heads)
	        v_heads = self._separate_heads(self.v_proj(v), self.num_heads)

	        # Dropout is only active in training mode.
	        if self.training:
	            dropout_p = self.dropout_p
	        else:
	            dropout_p = 0.0
	        attended = F.scaled_dot_product_attention(
	            q_heads, k_heads, v_heads, dropout_p=dropout_p
	        )

	        return self.out_proj(self._recombine_heads(attended))

	def init_t_xy(end_x: int, end_y: int):
	    """Return the x and y coordinate of every cell of an ``end_x`` x ``end_y`` grid.

	    Cells are enumerated with x varying fastest; both results are float32
	    vectors of length ``end_x * end_y``.
	    """
	    flat_index = torch.arange(end_x * end_y, dtype=torch.float32)
	    x_coord = (flat_index % end_x).float()
	    y_coord = torch.div(flat_index, end_x, rounding_mode="floor").float()
	    return x_coord, y_coord

	def compute_axial_cis(dim: int, end_x: int, end_y: int, theta: float = 10000.0):
	    """Build the complex rotary factors for an ``end_x`` x ``end_y`` grid.

	    ``dim // 4`` frequencies are used for each axis (the same frequencies
	    for x and y). Each grid cell gets ``exp(i * coord * freq)`` for its x
	    coordinate followed by the same for its y coordinate, concatenated
	    along the last dimension.
	    """
	    exponents = torch.arange(0, dim, 4)[: (dim // 4)].float() / dim
	    freqs_x = 1.0 / (theta ** exponents)
	    freqs_y = 1.0 / (theta ** exponents)

	    t_x, t_y = init_t_xy(end_x, end_y)
	    angles_x = torch.outer(t_x, freqs_x)
	    angles_y = torch.outer(t_y, freqs_y)

	    # Unit-magnitude complex numbers whose phase is the angle.
	    cis_x = torch.polar(torch.ones_like(angles_x), angles_x)
	    cis_y = torch.polar(torch.ones_like(angles_y), angles_y)
	    return torch.cat([cis_x, cis_y], dim=-1)

	def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
	    """View ``freqs_cis`` so it broadcasts against ``x``.

	    ``freqs_cis`` must have exactly the shape of the last two dimensions of
	    ``x``; the result keeps those two dimensions and has size 1 in every
	    leading dimension of ``x``.
	    """
	    ndim = x.ndim
	    assert ndim > 1
	    trailing = (x.shape[-2], x.shape[-1])
	    assert freqs_cis.shape == trailing
	    target_shape = [1] * (ndim - 2) + list(trailing)
	    return freqs_cis.view(*target_shape)

	def apply_rotary_enc(
	    xq: torch.Tensor,
	    xk: torch.Tensor,
	    freqs_cis: torch.Tensor,
	    repeat_freqs_k: bool = False,
	):
	    """Apply rotary position encoding to queries and keys.

	    The last dimension of ``xq`` / ``xk`` is read as consecutive
	    (real, imaginary) pairs, multiplied by the complex factors in
	    ``freqs_cis`` and flattened back to real values.

	    Arguments:
	      xq (torch.Tensor): queries; 4-D, with tokens on dim -2 and an even
	        number of channels on dim -1.
	      xk (torch.Tensor): keys, same layout as ``xq``. When it has no
	        tokens (size 0 on dim -2) it is returned unchanged.
	      freqs_cis (torch.Tensor): complex factors of shape
	        (n_query_tokens, channels // 2).
	      repeat_freqs_k (bool): tile the factors along the token dimension so
	        they cover a key sequence that is a multiple of the query length.

	    Returns:
	      tuple: the rotated queries and keys, cast back to the dtype and device
	        of the corresponding input.
	    """
	    xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
	    xk_ = (
	        torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
	        if xk.shape[-2] != 0
	        else None
	    )
	    freqs_cis = reshape_for_broadcast(freqs_cis, xq_)
	    xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)
	    if xk_ is None:
	        # no keys to rotate, due to dropout
	        return xq_out.type_as(xq).to(xq.device), xk
	    # repeat freqs along seq_len dim to match k seq_len
	    if repeat_freqs_k:
	        r = xk_.shape[-2] // xq_.shape[-2]
	        if freqs_cis.is_cuda:
	            # Repeat counts: 1 for every leading dim, r for the token dim,
	            # 1 for the channel dim. The list must be built with `*` and
	            # unpacked with `*`; without them a list is called -> TypeError.
	            freqs_cis = freqs_cis.repeat(*([1] * (freqs_cis.ndim - 2)), r, 1)
	        else:
	            # torch.repeat on complex numbers may not be supported on non-CUDA devices
	            # (freqs_cis has 4 dims and we repeat on dim 2) so we use expand + flatten
	            freqs_cis = freqs_cis.unsqueeze(2).expand(-1, -1, r, -1, -1).flatten(2, 3)
	    xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)
	    return xq_out.type_as(xq).to(xq.device), xk_out.type_as(xk).to(xk.device)

	class RoPEAttention(Attention):
	    """Attention with rotary position encoding."""

	    def __init__(
	        self,
	        *args,
	        rope_theta=10000.0,
	        # whether to repeat q rope to match k length
	        # this is needed for cross-attention to memories
	        rope_k_repeat=False,
	        feat_sizes=(64, 64),  # [w, h] for stride 16 feats at 1024 resolution
	        **kwargs,
	    ):
	        """Set up the base attention and precompute the rotary factors.

	        Args:
	            *args, **kwargs: forwarded unchanged to the parent `Attention`.
	            rope_theta: base of the rotary frequency ladder.
	            rope_k_repeat: allow keys to be longer than queries by tiling the
	                rotary factors along the key sequence.
	            feat_sizes: (w, h) grid for which the factors are precomputed.
	        """
	        # Forward positional and keyword arguments to the parent as-is.
	        super().__init__(*args, **kwargs)

	        self.compute_cis = partial(
	            compute_axial_cis, dim=self.internal_dim // self.num_heads, theta=rope_theta
	        )
	        freqs_cis = self.compute_cis(end_x=feat_sizes[0], end_y=feat_sizes[1])
	        # Kept as a plain attribute (not a registered buffer); moved to the
	        # query's device again in forward().
	        self.freqs_cis = (
	            freqs_cis.to("cuda") if torch.cuda.is_available() else freqs_cis
	        )
	        self.rope_k_repeat = rope_k_repeat

	    def forward(
	        self, q: Tensor, k: Tensor, v: Tensor, num_k_exclude_rope: int = 0
	    ) -> Tensor:
	        """Project q/k/v, rotate q and the leading keys, and attend.

	        Args:
	            q, k, v: inputs to the q/k/v projections of the parent class.
	            num_k_exclude_rope: number of trailing key tokens that are left
	                without rotary encoding.

	        Returns:
	            The output of `out_proj` applied to the recombined heads.
	        """
	        # Input projections
	        q = self.q_proj(q)
	        k = self.k_proj(k)
	        v = self.v_proj(v)

	        # Separate into heads
	        q = self._separate_heads(q, self.num_heads)
	        k = self._separate_heads(k, self.num_heads)
	        v = self._separate_heads(v, self.num_heads)

	        # Apply rotary position encoding
	        # The query tokens are treated as a square grid of side sqrt(num_tokens).
	        w = h = math.sqrt(q.shape[-2])
	        self.freqs_cis = self.freqs_cis.to(q.device)
	        if self.freqs_cis.shape[0] != q.shape[-2]:
	            # Cached factors were built for another grid size: rebuild them.
	            self.freqs_cis = self.compute_cis(end_x=w, end_y=h).to(q.device)
	        if q.shape[-2] != k.shape[-2]:
	            assert self.rope_k_repeat

	        num_k_rope = k.size(-2) - num_k_exclude_rope
	        q, k[:, :, :num_k_rope] = apply_rotary_enc(
	            q,
	            k[:, :, :num_k_rope],
	            freqs_cis=self.freqs_cis,
	            repeat_freqs_k=self.rope_k_repeat,
	        )

	        dropout_p = self.dropout_p if self.training else 0.0
	        # Attention
	        out = F.scaled_dot_product_attention(q, k, v, dropout_p=dropout_p)

	        out = self._recombine_heads(out)
	        out = self.out_proj(out)

	        return out

	# a large negative value as a placeholder score for missing objects
	# (`_forward_sam_heads` writes it into the low-res mask logits via `torch.where`
	# wherever the object score logit is not positive)
	NO_OBJ_SCORE = -1024.0

	def select_closest_cond_frames(frame_idx, cond_frame_outputs, max_cond_frame_num):
	    """
	    Pick at most `max_cond_frame_num` entries of `cond_frame_outputs` whose keys
	    (frame indices) are temporally nearest to `frame_idx`.

	    Selection order:
	    1. the nearest conditioning frame strictly before `frame_idx` (if any);
	    2. the nearest conditioning frame at or after `frame_idx` (if any);
	    3. the remaining frames by increasing absolute distance, until the limit
	       is reached.

	    A limit of -1, or a limit not smaller than the number of entries, selects
	    everything: the input dict itself is returned together with an empty dict.

	    Returns:
	    - selected_outputs: the chosen items (keys & values).
	    - unselected_outputs: the items that were not chosen.
	    """
	    # Nothing to prune: no limit, or already within the limit.
	    if max_cond_frame_num == -1 or len(cond_frame_outputs) <= max_cond_frame_num:
	        return cond_frame_outputs, {}

	    assert max_cond_frame_num >= 2, "we should allow using 2+ conditioning frames"
	    chosen = {}

	    # nearest frame in the past
	    earlier = [t for t in cond_frame_outputs if t < frame_idx]
	    if earlier:
	        nearest_before = max(earlier)
	        chosen[nearest_before] = cond_frame_outputs[nearest_before]

	    # nearest frame at the current index or in the future
	    later = [t for t in cond_frame_outputs if t >= frame_idx]
	    if later:
	        nearest_after = min(later)
	        chosen[nearest_after] = cond_frame_outputs[nearest_after]

	    # fill the remaining budget with the next-closest frames
	    # (stable sort, so ties keep the dict's iteration order)
	    budget = max_cond_frame_num - len(chosen)
	    candidates = [t for t in cond_frame_outputs if t not in chosen]
	    candidates.sort(key=lambda t: abs(t - frame_idx))
	    for t in candidates[:budget]:
	        chosen[t] = cond_frame_outputs[t]

	    leftover = {t: v for t, v in cond_frame_outputs.items() if t not in chosen}
	    return chosen, leftover


	def get_1d_sine_pe(pos_inds, dim, temperature=10000):
	    """
	    Sinusoidal 1D positional embedding in the style of the original Transformer.

	    For every entry of `pos_inds` this produces `dim // 2` sine values followed
	    by `dim // 2` cosine values on a new last dimension.
	    """
	    half = dim // 2
	    channel = torch.arange(half, dtype=torch.float32, device=pos_inds.device)
	    # consecutive channel pairs share the same wavelength
	    wavelength = temperature ** (2 * (channel // 2) / half)

	    angles = pos_inds.unsqueeze(-1) / wavelength
	    return torch.cat([angles.sin(), angles.cos()], dim=-1)

	class _SAM2Base(torch.nn.Module):
	def __init__(
	self,
	image_encoder,
	memory_attention,
	memory_encoder,
	num_maskmem=7, # default 1 input frame + 6 previous frames
	image_size=512,
	backbone_stride=16, # stride of the image backbone output
	sigmoid_scale_for_mem_enc=1.0, # scale factor for mask sigmoid prob
	sigmoid_bias_for_mem_enc=0.0, # bias factor for mask sigmoid prob
	# During evaluation, whether to binarize the sigmoid mask logits on interacted frames with clicks
	binarize_mask_from_pts_for_mem_enc=False,
	use_mask_input_as_output_without_sam=False, # on frames with mask input, whether to directly output the input mask without using a SAM prompt encoder + mask decoder
	# The maximum number of conditioning frames to participate in the memory attention (-1 means no limit; if there are more conditioning frames than this limit,
	# we only cross-attend to the temporally closest `max_cond_frames_in_attn` conditioning frames in the encoder when tracking each frame). This gives the model
	# a temporal locality when handling a large number of annotated frames (since closer frames should be more important) and also avoids GPU OOM.
	max_cond_frames_in_attn=-1,
	# on the first frame, whether to directly add the no-memory embedding to the image feature
	# (instead of using the transformer encoder)
	directly_add_no_mem_embed=False,
	# whether to use high-resolution feature maps in the SAM mask decoder
	use_high_res_features_in_sam=False,
	# whether to output multiple (3) masks for the first click on initial conditioning frames
	multimask_output_in_sam=False,
	# the minimum and maximum number of clicks to use multimask_output_in_sam (only relevant when `multimask_output_in_sam=True`;
	# default is 1 for both, meaning that only the first click gives multimask output; also note that a box counts as two points)
	multimask_min_pt_num=1,
	multimask_max_pt_num=1,
	# whether to also use multimask output for tracking (not just for the first click on initial conditioning frames; only relevant when `multimask_output_in_sam=True`)
	multimask_output_for_tracking=False,
	# Whether to use multimask tokens for obj ptr; Only relevant when both
	# use_obj_ptrs_in_encoder=True and multimask_output_for_tracking=True
	use_multimask_token_for_obj_ptr: bool = False,
	# whether to use sigmoid to restrict ious prediction to [0-1]
	iou_prediction_use_sigmoid=False,
	# The memory bank's temporal stride during evaluation (i.e. the `r` parameter in XMem and Cutie; XMem and Cutie use r=5).
	# For r>1, the (self.num_maskmem - 1) non-conditioning memory frames consist of
	# (self.num_maskmem - 2) nearest frames from every r-th frames, plus the last frame.
	memory_temporal_stride_for_eval=1,
	# whether to apply non-overlapping constraints on the object masks in the memory encoder during evaluation (to avoid/alleviate superposing masks)
	non_overlap_masks_for_mem_enc=False,
	# whether to cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
	use_obj_ptrs_in_encoder=False,
	# the maximum number of object pointers from other frames in encoder cross attention (only relevant when `use_obj_ptrs_in_encoder=True`)
	max_obj_ptrs_in_encoder=16,
	# whether to add temporal positional encoding to the object pointers in the encoder (only relevant when `use_obj_ptrs_in_encoder=True`)
	add_tpos_enc_to_obj_ptrs=True,
	# whether to add an extra linear projection layer for the temporal positional encoding in the object pointers to avoid potential interference
	# with spatial positional encoding (only relevant when both `use_obj_ptrs_in_encoder=True` and `add_tpos_enc_to_obj_ptrs=True`)
	proj_tpos_enc_in_obj_ptrs=False,
	# whether to use signed distance (instead of unsigned absolute distance) in the temporal positional encoding in the object pointers
	# (only relevant when both `use_obj_ptrs_in_encoder=True` and `add_tpos_enc_to_obj_ptrs=True`)
	use_signed_tpos_enc_to_obj_ptrs=False,
	# whether to only attend to object pointers in the past (before the current frame) in the encoder during evaluation
	# (only relevant when `use_obj_ptrs_in_encoder=True`; this might avoid pointer information too far in the future to distract the initial tracking)
	only_obj_ptrs_in_the_past_for_eval=False,
	# Whether to predict if there is an object in the frame
	pred_obj_scores: bool = False,
	# Whether to use an MLP to predict object scores
	pred_obj_scores_mlp: bool = False,
	# Only relevant if pred_obj_scores=True and use_obj_ptrs_in_encoder=True;
	# Whether to have a fixed no obj pointer when there is no object present
	# or to use it as an additive embedding with obj_ptr produced by decoder
	fixed_no_obj_ptr: bool = False,
	# Soft no object, i.e. mix in no_obj_ptr softly,
	# hope to make recovery easier if there is a mistake and mitigate accumulation of errors
	soft_no_obj_ptr: bool = False,
	use_mlp_for_obj_ptr_proj: bool = False,
	# add no obj embedding to spatial frames
	no_obj_embed_spatial: bool = False,
	# extra arguments used to construct the SAM mask decoder; if not None, it should be a dict of kwargs to be passed into `MaskDecoder` class.
	sam_mask_decoder_extra_args=None,
	compile_image_encoder: bool = False,
	):
	"""
	Store the tracking/memory configuration, register the learned embeddings and
	build the SAM prompt encoder and mask decoder (via `_build_sam_heads`).
	`image_encoder`, `memory_attention` and `memory_encoder` are stored as given;
	`image_encoder.neck.d_model` is read as the hidden dimension. The remaining
	options are described by the comments next to each parameter. When
	`compile_image_encoder` is True, `image_encoder.forward` is wrapped with
	`torch.compile`.
	"""
	super().__init__()

	# Part 1: the image backbone
	self.image_encoder = image_encoder
	# Use level 0, 1, 2 for high-res setting, or just level 2 for the default setting
	self.use_high_res_features_in_sam = use_high_res_features_in_sam
	self.num_feature_levels = 3 if use_high_res_features_in_sam else 1
	self.use_obj_ptrs_in_encoder = use_obj_ptrs_in_encoder
	self.max_obj_ptrs_in_encoder = max_obj_ptrs_in_encoder
	if use_obj_ptrs_in_encoder:
	# A conv layer to downsample the mask prompt to stride 4 (the same stride as
	# low-res SAM mask logits) and to change its scales from 0~1 to SAM logit scale,
	# so that it can be fed into the SAM mask decoder to generate a pointer.
	self.mask_downsample = torch.nn.Conv2d(1, 1, kernel_size=4, stride=4)
	self.add_tpos_enc_to_obj_ptrs = add_tpos_enc_to_obj_ptrs
	if proj_tpos_enc_in_obj_ptrs:
	assert add_tpos_enc_to_obj_ptrs # these options need to be used together
	self.proj_tpos_enc_in_obj_ptrs = proj_tpos_enc_in_obj_ptrs
	self.use_signed_tpos_enc_to_obj_ptrs = use_signed_tpos_enc_to_obj_ptrs
	self.only_obj_ptrs_in_the_past_for_eval = only_obj_ptrs_in_the_past_for_eval

	# Part 2: memory attention to condition current frame's visual features
	# with memories (and obj ptrs) from past frames
	self.memory_attention = memory_attention
	self.hidden_dim = image_encoder.neck.d_model

	# Part 3: memory encoder for the previous frame's outputs
	self.memory_encoder = memory_encoder
	self.mem_dim = self.hidden_dim
	if hasattr(self.memory_encoder, "out_proj") and hasattr(
	self.memory_encoder.out_proj, "weight"
	):
	# if there is compression of memories along channel dim
	self.mem_dim = self.memory_encoder.out_proj.weight.shape[0]
	self.num_maskmem = num_maskmem # Number of memories accessible
	# Temporal encoding of the memories
	self.maskmem_tpos_enc = torch.nn.Parameter(
	torch.zeros(num_maskmem, 1, 1, self.mem_dim)
	)
	trunc_normal_(self.maskmem_tpos_enc, std=0.02)
	# a single token to indicate no memory embedding from previous frames
	self.no_mem_embed = torch.nn.Parameter(torch.zeros(1, 1, self.hidden_dim))
	self.no_mem_pos_enc = torch.nn.Parameter(torch.zeros(1, 1, self.hidden_dim))
	trunc_normal_(self.no_mem_embed, std=0.02)
	trunc_normal_(self.no_mem_pos_enc, std=0.02)
	self.directly_add_no_mem_embed = directly_add_no_mem_embed
	# Apply sigmoid to the output raw mask logits (to turn them from
	# range (-inf, +inf) to range (0, 1)) before feeding them into the memory encoder
	self.sigmoid_scale_for_mem_enc = sigmoid_scale_for_mem_enc
	self.sigmoid_bias_for_mem_enc = sigmoid_bias_for_mem_enc
	self.binarize_mask_from_pts_for_mem_enc = binarize_mask_from_pts_for_mem_enc
	self.non_overlap_masks_for_mem_enc = non_overlap_masks_for_mem_enc
	self.memory_temporal_stride_for_eval = memory_temporal_stride_for_eval
	# On frames with mask input, whether to directly output the input mask without
	# using a SAM prompt encoder + mask decoder
	self.use_mask_input_as_output_without_sam = use_mask_input_as_output_without_sam
	self.multimask_output_in_sam = multimask_output_in_sam
	self.multimask_min_pt_num = multimask_min_pt_num
	self.multimask_max_pt_num = multimask_max_pt_num
	self.multimask_output_for_tracking = multimask_output_for_tracking
	self.use_multimask_token_for_obj_ptr = use_multimask_token_for_obj_ptr
	self.iou_prediction_use_sigmoid = iou_prediction_use_sigmoid

	# Part 4: SAM-style prompt encoder (for both mask and point inputs)
	# and SAM-style mask decoder for the final mask output
	self.image_size = image_size
	self.backbone_stride = backbone_stride
	self.sam_mask_decoder_extra_args = sam_mask_decoder_extra_args
	self.pred_obj_scores = pred_obj_scores
	self.pred_obj_scores_mlp = pred_obj_scores_mlp
	self.fixed_no_obj_ptr = fixed_no_obj_ptr
	self.soft_no_obj_ptr = soft_no_obj_ptr
	if self.fixed_no_obj_ptr:
	assert self.pred_obj_scores
	assert self.use_obj_ptrs_in_encoder
	if self.pred_obj_scores and self.use_obj_ptrs_in_encoder:
	self.no_obj_ptr = torch.nn.Parameter(torch.zeros(1, self.hidden_dim))
	trunc_normal_(self.no_obj_ptr, std=0.02)
	self.use_mlp_for_obj_ptr_proj = use_mlp_for_obj_ptr_proj
	self.no_obj_embed_spatial = None
	if no_obj_embed_spatial:
	self.no_obj_embed_spatial = torch.nn.Parameter(torch.zeros(1, self.mem_dim))
	trunc_normal_(self.no_obj_embed_spatial, std=0.02)

	# Must run after the attributes above are set: `_build_sam_heads` reads
	# hidden_dim, mem_dim, image_size, backbone_stride and the SAM-related flags.
	self._build_sam_heads()
	self.max_cond_frames_in_attn = max_cond_frames_in_attn

	# Model compilation
	if compile_image_encoder:
	# Compile the forward function (not the full module) to allow loading checkpoints.
	print(
	"Image encoder compilation is enabled. First forward pass will be slow."
	)
	self.image_encoder.forward = torch.compile(
	self.image_encoder.forward,
	mode="max-autotune",
	fullgraph=True,
	dynamic=False,
	)

	@property
	def device(self):
	    """Device of the first parameter yielded by `self.parameters()`."""
	    first_param = next(self.parameters())
	    return first_param.device

	def forward(self, *args, **kwargs):
	    """Always raise: this base class is not meant to be called directly.

	    Accepts any positional and keyword arguments so that every call style
	    reaches the explanatory `NotImplementedError` instead of failing earlier
	    with a signature `TypeError`.

	    Raises:
	        NotImplementedError: unconditionally.
	    """
	    raise NotImplementedError(
	        "Please use the corresponding methods in SAM2VideoPredictor for inference or SAM2Train for training/fine-tuning. "
	        "See notebooks/video_predictor_example.ipynb for an inference example."
	    )

	def _build_sam_heads(self):
	    """Create the SAM prompt encoder, the mask decoder and the pointer projections."""
	    embed_dim = self.hidden_dim
	    grid = self.image_size // self.backbone_stride
	    self.sam_prompt_embed_dim = embed_dim
	    self.sam_image_embedding_size = grid

	    # PromptEncoder / MaskDecoder hyperparameters (e.g. `mask_in_chans=16`)
	    # are taken over from the SAM code.
	    self.sam_prompt_encoder = PromptEncoder(
	        embed_dim=embed_dim,
	        image_embedding_size=(grid, grid),
	        input_image_size=(self.image_size, self.image_size),
	        mask_in_chans=16,
	    )

	    two_way = TwoWayTransformer(
	        depth=2,
	        embedding_dim=embed_dim,
	        mlp_dim=2048,
	        num_heads=8,
	    )
	    extra_decoder_kwargs = self.sam_mask_decoder_extra_args or {}
	    self.sam_mask_decoder = MaskDecoder(
	        num_multimask_outputs=3,
	        transformer=two_way,
	        transformer_dim=embed_dim,
	        iou_head_depth=3,
	        iou_head_hidden_dim=256,
	        use_high_res_features=self.use_high_res_features_in_sam,
	        iou_prediction_use_sigmoid=self.iou_prediction_use_sigmoid,
	        pred_obj_scores=self.pred_obj_scores,
	        pred_obj_scores_mlp=self.pred_obj_scores_mlp,
	        use_multimask_token_for_obj_ptr=self.use_multimask_token_for_obj_ptr,
	        **extra_decoder_kwargs,
	    )

	    # Projection that turns SAM output tokens into object pointers.
	    if not self.use_obj_ptrs_in_encoder:
	        self.obj_ptr_proj = torch.nn.Identity()
	    else:
	        self.obj_ptr_proj = torch.nn.Linear(self.hidden_dim, self.hidden_dim)
	        if self.use_mlp_for_obj_ptr_proj:
	            # the linear layer above is replaced by a 3-layer MLP
	            self.obj_ptr_proj = MLP(
	                self.hidden_dim, self.hidden_dim, self.hidden_dim, 3
	            )

	    # Optional linear projection on the temporal positional encoding of the
	    # object pointers, to avoid interference with spatial positional encoding.
	    if not self.proj_tpos_enc_in_obj_ptrs:
	        self.obj_ptr_tpos_proj = torch.nn.Identity()
	    else:
	        self.obj_ptr_tpos_proj = torch.nn.Linear(self.hidden_dim, self.mem_dim)

	def _forward_sam_heads(
	    self,
	    backbone_features,
	    point_inputs=None,
	    mask_inputs=None,
	    high_res_features=None,
	    multimask_output=False,
	):
	    """
	    Run the SAM prompt encoder and mask decoder on one batch of features.

	    Inputs:
	    - backbone_features: image features of [B, C, H, W] shape
	    - point_inputs: None, or a dict with
	      1) "point_coords": [B, P, 2], float32, absolute pixel (x, y) coordinates
	         of the P input points
	      2) "point_labels": [B, P], int32; 1 = positive click, 0 = negative click,
	         -1 = padding
	    - mask_inputs: None, or a float/bool mask of [B, 1, H*16, W*16] shape (same
	      spatial size as the image)
	    - high_res_features: None, or a list of two feature maps of [B, C, 4*H, 4*W]
	      and [B, C, 2*H, 2*W] shapes, used as high-resolution features in the decoder
	    - multimask_output: True -> 3 candidate masks with 3 IoU estimates;
	      False -> a single mask with its IoU estimate

	    Outputs (M = 3 if `multimask_output` else 1):
	    - low_res_multimasks: [B, M, H*4, W*4] mask logits (before sigmoid), at 4x
	      the resolution of `backbone_features`
	    - high_res_multimasks: [B, M, H*16, W*16] logits upsampled to image size
	    - ious: [B, M] estimated IoU of each output mask
	    - low_res_masks: [B, 1, H*4, W*4], the mask with the highest IoU estimate
	      (identical to `low_res_multimasks` when `multimask_output=False`)
	    - high_res_masks: [B, 1, H*16, W*16], same selection at image resolution
	    - obj_ptr: [B, C] object pointer derived from the decoder's output token
	    - object_score_logits: object score logits returned by the mask decoder
	    """
	    batch = backbone_features.size(0)
	    dev = backbone_features.device
	    assert backbone_features.size(1) == self.sam_prompt_embed_dim
	    assert backbone_features.size(2) == self.sam_image_embedding_size
	    assert backbone_features.size(3) == self.sam_image_embedding_size

	    # a) point prompts
	    if point_inputs is None:
	        # no points given: use one padding point (label -1) per batch element
	        sam_point_coords = torch.zeros(batch, 1, 2, device=dev)
	        sam_point_labels = -torch.ones(batch, 1, dtype=torch.int32, device=dev)
	    else:
	        sam_point_coords = point_inputs["point_coords"]
	        sam_point_labels = point_inputs["point_labels"]
	        assert sam_point_coords.size(0) == batch and sam_point_labels.size(0) == batch

	    # b) mask prompts; None makes the prompt encoder fall back to its learned
	    # `no_mask_embed`
	    sam_mask_prompt = None
	    if mask_inputs is not None:
	        assert len(mask_inputs.shape) == 4 and mask_inputs.shape[:2] == (batch, 1)
	        if mask_inputs.shape[-2:] != self.sam_prompt_encoder.mask_input_size:
	            # resize to the low-res size expected by the prompt encoder
	            sam_mask_prompt = F.interpolate(
	                mask_inputs.float(),
	                size=self.sam_prompt_encoder.mask_input_size,
	                align_corners=False,
	                mode="bilinear",
	                antialias=True,  # use antialias for downsampling
	            )
	        else:
	            sam_mask_prompt = mask_inputs

	    sparse_embeddings, dense_embeddings = self.sam_prompt_encoder(
	        points=(sam_point_coords, sam_point_labels),
	        boxes=None,
	        masks=sam_mask_prompt,
	    )
	    decoder_out = self.sam_mask_decoder(
	        image_embeddings=backbone_features,
	        image_pe=self.sam_prompt_encoder.get_dense_pe(),
	        sparse_prompt_embeddings=sparse_embeddings,
	        dense_prompt_embeddings=dense_embeddings,
	        multimask_output=multimask_output,
	        repeat_image=False,  # the image is already batched
	        high_res_features=high_res_features,
	    )
	    low_res_multimasks, ious, sam_output_tokens, object_score_logits = decoder_out

	    if self.pred_obj_scores:
	        is_obj_appearing = object_score_logits > 0
	        # Masks used for spatial memories get a hard object / no-object decision,
	        # consistent with the actual mask prediction.
	        low_res_multimasks = torch.where(
	            is_obj_appearing[:, None, None],
	            low_res_multimasks,
	            NO_OBJ_SCORE,
	        )

	    # cast possibly bfloat16/float16 masks to float32
	    # (PyTorch versions before 2.1 don't support `interpolate` on bf16)
	    low_res_multimasks = low_res_multimasks.float()
	    high_res_multimasks = F.interpolate(
	        low_res_multimasks,
	        size=(self.image_size, self.image_size),
	        mode="bilinear",
	        align_corners=False,
	    )

	    sam_output_token = sam_output_tokens[:, 0]
	    if not multimask_output:
	        low_res_masks, high_res_masks = low_res_multimasks, high_res_multimasks
	    else:
	        # keep the candidate with the highest estimated IoU
	        best = torch.argmax(ious, dim=-1)
	        rows = torch.arange(batch, device=dev)
	        low_res_masks = low_res_multimasks[rows, best].unsqueeze(1)
	        high_res_masks = high_res_multimasks[rows, best].unsqueeze(1)
	        if sam_output_tokens.size(1) > 1:
	            sam_output_token = sam_output_tokens[rows, best]

	    # object pointer from the SAM output token (with occlusion handling)
	    obj_ptr = self.obj_ptr_proj(sam_output_token)
	    if self.pred_obj_scores:
	        # unlike the masks, the pointer may use a soft object / no-object mix
	        lambda_is_obj_appearing = (
	            object_score_logits.sigmoid()
	            if self.soft_no_obj_ptr
	            else is_obj_appearing.float()
	        )
	        if self.fixed_no_obj_ptr:
	            obj_ptr = lambda_is_obj_appearing * obj_ptr
	        obj_ptr = obj_ptr + (1 - lambda_is_obj_appearing) * self.no_obj_ptr

	    return (
	        low_res_multimasks,
	        high_res_multimasks,
	        ious,
	        low_res_masks,
	        high_res_masks,
	        obj_ptr,
	        object_score_logits,
	    )

	def _use_mask_as_output(self, backbone_features, high_res_features, mask_inputs):
	"""
	Directly turn binary `mask_inputs` into a output mask logits without using SAM.
	(same input and output shapes as in _forward_sam_heads above).
	"""
	# Use -10/+10 as logits for neg/pos pixels (very close to 0/1 in prob after sigmoid).
	out_scale, out_bias = 20.0, -10.0 # sigmoid(-10.0)=4.5398e-05
	mask_inputs_float = mask_inputs.float()
	high_res_masks = mask_inputs_float * out_scale + out_bias
	low_res_masks = F.interpolate(
	high_res_masks,
	size=(high_res_masks.size(-2) // 4, high_res_masks.size(-1) // 4),
	align_corners=False,
	mode="bilinear",
	antialias=True, # use antialias for downsampling
	)
	# a dummy IoU prediction of all 1's under mask input
	ious = mask_inputs.new_ones(mask_inputs.size(0), 1).float()
	if not self.use_obj_ptrs_in_encoder:
	# all zeros as a dummy object pointer (of shape [B, C])
	obj_ptr = torch.zeros(
	mask_inputs.size(0), self.hidden_dim, device=mask_inputs.device
	)
	else:
	# produce an object pointer using the SAM decoder from the mask input
	_, _, _, _, _, obj_ptr, _ = self._forward_sam_heads(
	backbone_features=backbone_features,
	mask_inputs=self.mask_downsample(mask_inputs_float),
	high_res_features=high_res_features,
	)
	# In this method, we are treating mask_input as output, e.g. using it directly to create spatial mem;
	# Below, we follow the same design axiom to use mask_input to decide if obj appears or not instead of relying
	# on the object_scores from the SAM decoder.
	is_obj_appearing = torch.any(mask_inputs.flatten(1).float() > 0.0, dim=1)
	is_obj_appearing = is_obj_appearing[..., None]
	lambda_is_obj_appearing = is_obj_appearing.float()
	object_score_logits = out_scale * lambda_is_obj_appearing + out_bias
	if self.pred_obj_scores:
	if self.fixed_no_obj_ptr:
	obj_ptr = lambda_is_obj_appearing * obj_ptr
	obj_ptr = obj_ptr + (1 - lambda_is_obj_appearing) * self.no_obj_ptr

	return (
	low_res_masks,
	high_res_masks,
	ious,
	low_res_masks,
	high_res_masks,
	obj_ptr,
	object_score_logits,
	)

	def forward_image(self, img_batch: torch.Tensor):
	    """Run the image encoder on `img_batch` and return its output dict."""
	    backbone_out = self.image_encoder(img_batch)
	    if not self.use_high_res_features_in_sam:
	        return backbone_out

	    # Project level 0 and level 1 features once here, so the SAM decoder
	    # does not have to redo it on every click.
	    decoder = self.sam_mask_decoder
	    fpn = backbone_out["backbone_fpn"]
	    for level, project in enumerate((decoder.conv_s0, decoder.conv_s1)):
	        fpn[level] = project(fpn[level])
	    return backbone_out

	def _prepare_backbone_features(self, backbone_out):
	"""Prepare and flatten visual features."""
	backbone_out = backbone_out.copy()
	assert len(backbone_out["backbone_fpn"]) == len(backbone_out["vision_pos_enc"])
	assert len(backbone_out["backbone_fpn"]) >= self.num_feature_levels

	feature_maps = backbone_out["backbone_fpn"][-self.num_feature_levels :]
	vision_pos_embeds = backbone_out["vision_pos_enc"][-self.num_feature_levels :]

	feat_sizes = [(x.shape[-2], x.shape[-1]) for x in vision_pos_embeds]
	# flatten NxCxHxW to HWxNxC
	vision_feats = [x.flatten(2).permute(2, 0, 1).contiguous() for x in feature_maps]
	vision_pos_embeds = [x.flatten(2).permute(2, 0, 1).contiguous() for x in vision_pos_embeds]

	return backbone_out, vision_feats, vision_pos_embeds, feat_sizes

	def _prepare_memory_conditioned_features(
	self,
	frame_idx,
	is_init_cond_frame,
	current_vision_feats,
	current_vision_pos_embeds,
	feat_sizes,
	output_dict,
	num_frames,
	track_in_reverse=False, # tracking in reverse time order (for demo usage)
	):
	"""Fuse the current frame's visual feature map with previous memory."""
	B = current_vision_feats[-1].size(1) # batch size on this frame
	C = self.hidden_dim
	H, W = feat_sizes[-1] # top-level (lowest-resolution) feature size
	device = current_vision_feats[-1].device
	# The case of `self.num_maskmem == 0` below is primarily used for reproducing SAM on images.
	# In this case, we skip the fusion with any memory.
	if self.num_maskmem == 0: # Disable memory and skip fusion
	pix_feat = current_vision_feats[-1].permute(1, 2, 0).view(B, C, H, W).contiguous()
	return pix_feat

	num_obj_ptr_tokens = 0
	tpos_sign_mul = -1 if track_in_reverse else 1
	# Step 1: condition the visual features of the current frame on previous memories
	if not is_init_cond_frame:
	# Retrieve the memories encoded with the maskmem backbone
	to_cat_memory, to_cat_memory_pos_embed = [], []
	# Add conditioning frames's output first (all cond frames have t_pos=0 for
	# when getting temporal positional embedding below)
	assert len(output_dict["cond_frame_outputs"]) > 0
	# Select a maximum number of temporally closest cond frames for cross attention
	cond_outputs = output_dict["cond_frame_outputs"]
	selected_cond_outputs, unselected_cond_outputs = select_closest_cond_frames(
	frame_idx, cond_outputs, self.max_cond_frames_in_attn
	)
	t_pos_and_prevs = [(0, out) for out in selected_cond_outputs.values()]
	# Add last (self.num_maskmem - 1) frames before current frame for non-conditioning memory
	# the earliest one has t_pos=1 and the latest one has t_pos=self.num_maskmem-1
	# We also allow taking the memory frame non-consecutively (with stride>1), in which case
	# we take (self.num_maskmem - 2) frames among every stride-th frames plus the last frame.
	stride = 1 if self.training else self.memory_temporal_stride_for_eval
	for t_pos in range(1, self.num_maskmem):
	t_rel = self.num_maskmem - t_pos # how many frames before current frame
	if t_rel == 1:
	# for t_rel == 1, we take the last frame (regardless of r)
	if not track_in_reverse:
	# the frame immediately before this frame (i.e. frame_idx - 1)
	prev_frame_idx = frame_idx - t_rel
	else:
	# the frame immediately after this frame (i.e. frame_idx + 1)
	prev_frame_idx = frame_idx + t_rel
	else:
	# for t_rel >= 2, we take the memory frame from every r-th frames
	if not track_in_reverse:
	# first find the nearest frame among every r-th frames before this frame
	# for r=1, this would be (frame_idx - 2)
	prev_frame_idx = ((frame_idx - 2) // stride) * stride
	# then seek further among every r-th frames
	prev_frame_idx = prev_frame_idx - (t_rel - 2) * stride
	else:
	# first find the nearest frame among every r-th frames after this frame
	# for r=1, this would be (frame_idx + 2)
	prev_frame_idx = -(-(frame_idx + 2) // stride) * stride
	# then seek further among every r-th frames
	prev_frame_idx = prev_frame_idx + (t_rel - 2) * stride
	out = output_dict["non_cond_frame_outputs"].get(prev_frame_idx, None)
	if out is None:
	# If an unselected conditioning frame is among the last (self.num_maskmem - 1)
	# frames, we still attend to it as if it's a non-conditioning frame.
	out = unselected_cond_outputs.get(prev_frame_idx, None)
	t_pos_and_prevs.append((t_pos, out))

	for t_pos, prev in t_pos_and_prevs:
	if prev is None:
	continue # skip padding frames
	# "maskmem_features" might have been offloaded to CPU in demo use cases,
	# so we load it back to GPU (it's a no-op if it's already on GPU).
	feats = prev["maskmem_features"].to(device, non_blocking=True)
	to_cat_memory.append(feats.flatten(2).permute(2, 0, 1).contiguous())
	# Spatial positional encoding (it might have been offloaded to CPU in eval)
	maskmem_enc = prev["maskmem_pos_enc"][-1].to(device)
	maskmem_enc = maskmem_enc.flatten(2).permute(2, 0, 1).contiguous()
	# Temporal positional encoding
	maskmem_enc = (
	maskmem_enc + self.maskmem_tpos_enc[self.num_maskmem - t_pos - 1]
	)
	to_cat_memory_pos_embed.append(maskmem_enc)

	# Construct the list of past object pointers
	if self.use_obj_ptrs_in_encoder:
	max_obj_ptrs_in_encoder = min(num_frames, self.max_obj_ptrs_in_encoder)
	# First add those object pointers from selected conditioning frames
	# (optionally, only include object pointers in the past during evaluation)
	if not self.training and self.only_obj_ptrs_in_the_past_for_eval:
	ptr_cond_outputs = {
	t: out
	for t, out in selected_cond_outputs.items()
	if (t >= frame_idx if track_in_reverse else t <= frame_idx)
	}
	else:
	ptr_cond_outputs = selected_cond_outputs
	pos_and_ptrs = [
	# Temporal pos encoding contains how far away each pointer is from current frame
	(
	(
	(frame_idx - t) * tpos_sign_mul
	if self.use_signed_tpos_enc_to_obj_ptrs
	else abs(frame_idx - t)
	),
	out["obj_ptr"],
	)
	for t, out in ptr_cond_outputs.items()
	]
	# Add up to (max_obj_ptrs_in_encoder - 1) non-conditioning frames before current frame
	for t_diff in range(1, max_obj_ptrs_in_encoder):
	t = frame_idx + t_diff if track_in_reverse else frame_idx - t_diff
	if t < 0 or (num_frames is not None and t >= num_frames):
	break
	out = output_dict["non_cond_frame_outputs"].get(
	t, unselected_cond_outputs.get(t, None)
	)
	if out is not None:
	pos_and_ptrs.append((t_diff, out["obj_ptr"]))
	# If we have at least one object pointer, add them to the across attention
	if len(pos_and_ptrs) > 0:
	pos_list, ptrs_list = zip(*pos_and_ptrs)
	# stack object pointers along dim=0 into [ptr_seq_len, B, C] shape
	obj_ptrs = torch.stack(ptrs_list, dim=0)
	# a temporal positional embedding based on how far each object pointer is from
	# the current frame (sine embedding normalized by the max pointer num).
	if self.add_tpos_enc_to_obj_ptrs:
	t_diff_max = max_obj_ptrs_in_encoder - 1
	tpos_dim = C if self.proj_tpos_enc_in_obj_ptrs else self.mem_dim
	obj_pos = torch.tensor(pos_list).to(
	device=device, non_blocking=True
	)
	obj_pos = get_1d_sine_pe(obj_pos / t_diff_max, dim=tpos_dim)
	obj_pos = self.obj_ptr_tpos_proj(obj_pos)
	obj_pos = obj_pos.unsqueeze(1).expand(-1, B, self.mem_dim)
	else:
	obj_pos = obj_ptrs.new_zeros(len(pos_list), B, self.mem_dim)
	if self.mem_dim < C:
	# split a pointer into (C // self.mem_dim) tokens for self.mem_dim < C
	obj_ptrs = obj_ptrs.reshape(
	-1, B, C // self.mem_dim, self.mem_dim
	).contiguous()
	obj_ptrs = obj_ptrs.permute(0, 2, 1, 3).flatten(0, 1).contiguous()
	obj_pos = obj_pos.repeat_interleave(C // self.mem_dim, dim=0)
	to_cat_memory.append(obj_ptrs)
	to_cat_memory_pos_embed.append(obj_pos)
	num_obj_ptr_tokens = obj_ptrs.shape[0]
	else:
	num_obj_ptr_tokens = 0
	else:
	# for initial conditioning frames, encode them without using any previous memory
	if self.directly_add_no_mem_embed:
	# directly add no-mem embedding (instead of using the transformer encoder)
	pix_feat_with_mem = current_vision_feats[-1] + self.no_mem_embed
	pix_feat_with_mem = pix_feat_with_mem.permute(1, 2, 0).view(B, C, H, W).contiguous()
	return pix_feat_with_mem

	# Use a dummy token on the first frame (to avoid empty memory input to transformer encoder)
	to_cat_memory = [self.no_mem_embed.expand(1, B, self.mem_dim)]
	to_cat_memory_pos_embed = [self.no_mem_pos_enc.expand(1, B, self.mem_dim)]

	# Step 2: Concatenate the memories and forward through the transformer encoder
	memory = torch.cat(to_cat_memory, dim=0)
	memory_pos_embed = torch.cat(to_cat_memory_pos_embed, dim=0)

	pix_feat_with_mem = self.memory_attention(
	curr=current_vision_feats,
	curr_pos=current_vision_pos_embeds,
	memory=memory,
	memory_pos=memory_pos_embed,
	num_obj_ptr_tokens=num_obj_ptr_tokens,
	)
	# reshape the output (HW)BC => BCHW
	pix_feat_with_mem = pix_feat_with_mem.permute(1, 2, 0).view(B, C, H, W).contiguous()
	return pix_feat_with_mem

	def _encode_new_memory(
	    self,
	    current_vision_feats,
	    feat_sizes,
	    pred_masks_high_res,
	    object_score_logits,
	    is_mask_from_pts,
	):
	    """Turn the current frame's features and predicted mask into a memory entry.

	    The last entry of ``current_vision_feats`` is reshaped from (HW)BC to BCHW,
	    the mask logits are converted to a mask signal (hard threshold or sigmoid,
	    then scaled and biased), and both are passed to ``self.memory_encoder``.

	    Returns:
	        ``(maskmem_features, maskmem_pos_enc)``, read from the
	        ``"vision_features"`` and ``"vision_pos_enc"`` entries of the memory
	        encoder output.
	    """
	    top_feat = current_vision_feats[-1]
	    batch = top_feat.size(1)  # batch size on this frame
	    channels = self.hidden_dim
	    height, width = feat_sizes[-1]  # size of the last feature level
	    # (HW)BC => BCHW
	    pix_feat = (
	        top_feat.permute(1, 2, 0).view(batch, channels, height, width).contiguous()
	    )

	    # Optional non-overlapping constraint; it works along the batch dimension
	    # and is only enabled outside of training.
	    if self.non_overlap_masks_for_mem_enc and not self.training:
	        pred_masks_high_res = self._apply_non_overlapping_constraints(
	            pred_masks_high_res
	        )

	    use_hard_mask = (
	        self.binarize_mask_from_pts_for_mem_enc
	        and is_mask_from_pts
	        and not self.training
	    )
	    if use_hard_mask:
	        # threshold the raw logits at zero
	        mask_for_mem = (pred_masks_high_res > 0).float()
	    else:
	        # squash the raw logits into (0, 1)
	        mask_for_mem = torch.sigmoid(pred_masks_high_res)

	    # scale and bias terms are applied to either kind of mask signal
	    scale = self.sigmoid_scale_for_mem_enc
	    bias = self.sigmoid_bias_for_mem_enc
	    if scale != 1.0:
	        mask_for_mem = mask_for_mem * scale
	    if bias != 0.0:
	        mask_for_mem = mask_for_mem + bias

	    # the mask signal is already post-sigmoid, so the encoder must not redo it
	    encoded = self.memory_encoder(pix_feat, mask_for_mem, skip_mask_sigmoid=True)
	    maskmem_features = encoded["vision_features"]
	    maskmem_pos_enc = encoded["vision_pos_enc"]

	    # Where the object score says "no object", add the no-object embedding to
	    # the spatial memory (in place, as before).
	    if self.no_obj_embed_spatial is not None:
	        obj_absent = 1 - (object_score_logits > 0).float()
	        no_obj_term = self.no_obj_embed_spatial[..., None, None].expand(
	            *maskmem_features.shape
	        )
	        maskmem_features += obj_absent[..., None, None] * no_obj_term

	    return maskmem_features, maskmem_pos_enc

	def _track_step(
	    self,
	    frame_idx,
	    is_init_cond_frame,
	    current_vision_feats,
	    current_vision_pos_embeds,
	    feat_sizes,
	    point_inputs,
	    mask_inputs,
	    output_dict,
	    num_frames,
	    track_in_reverse,
	    prev_sam_mask_logits,
	):
	    """Run the SAM heads for one frame, without encoding a new memory.

	    Returns:
	        ``(current_out, sam_outputs, high_res_features, pix_feat)`` where
	        ``current_out`` holds the ``point_inputs`` / ``mask_inputs`` that were
	        passed in, and ``pix_feat`` is the BCHW feature handed to the head.
	    """
	    current_out = {"point_inputs": point_inputs, "mask_inputs": mask_inputs}

	    # All levels but the last are high-resolution maps for the SAM head;
	    # each is reshaped (HW)BC => BCHW.
	    high_res_features = None
	    if len(current_vision_feats) > 1:
	        high_res_features = []
	        for feat, size in zip(current_vision_feats[:-1], feat_sizes[:-1]):
	            high_res_features.append(
	                feat.permute(1, 2, 0)
	                .view(feat.size(1), feat.size(2), *size)
	                .contiguous()
	            )

	    if mask_inputs is not None and self.use_mask_input_as_output_without_sam:
	        # The given mask is output directly, bypassing the SAM prompt encoder
	        # and mask decoder.
	        pix_feat = current_vision_feats[-1].permute(1, 2, 0).contiguous()
	        pix_feat = pix_feat.view(-1, self.hidden_dim, *feat_sizes[-1]).contiguous()
	        sam_outputs = self._use_mask_as_output(
	            pix_feat, high_res_features, mask_inputs
	        )
	        return current_out, sam_outputs, high_res_features, pix_feat

	    # Fuse the last-level visual feature with the memory bank.
	    pix_feat = self._prepare_memory_conditioned_features(
	        frame_idx=frame_idx,
	        is_init_cond_frame=is_init_cond_frame,
	        current_vision_feats=current_vision_feats[-1:],
	        current_vision_pos_embeds=current_vision_pos_embeds[-1:],
	        feat_sizes=feat_sizes[-1:],
	        output_dict=output_dict,
	        num_frames=num_frames,
	        track_in_reverse=track_in_reverse,
	    )
	    # Previously predicted low-res SAM logits may be fed to the decoder as the
	    # mask prompt; real `mask_inputs` are not expected together with them.
	    if prev_sam_mask_logits is not None:
	        assert point_inputs is not None and mask_inputs is None
	        mask_inputs = prev_sam_mask_logits
	    multimask_output = self._use_multimask(is_init_cond_frame, point_inputs)
	    sam_outputs = self._forward_sam_heads(
	        backbone_features=pix_feat,
	        point_inputs=point_inputs,
	        mask_inputs=mask_inputs,
	        high_res_features=high_res_features,
	        multimask_output=multimask_output,
	    )
	    return current_out, sam_outputs, high_res_features, pix_feat

	def _encode_memory_in_output(
	    self,
	    current_vision_feats,
	    feat_sizes,
	    point_inputs,
	    run_mem_encoder,
	    high_res_masks,
	    object_score_logits,
	    current_out,
	):
	    """Store the memory encoding of this frame in ``current_out`` (in place).

	    ``current_out["maskmem_features"]`` and ``current_out["maskmem_pos_enc"]``
	    are set to the output of ``self._encode_new_memory`` when
	    ``run_mem_encoder`` is truthy and ``self.num_maskmem > 0``; otherwise both
	    are set to ``None``.
	    """
	    maskmem_features = maskmem_pos_enc = None
	    if run_mem_encoder and self.num_maskmem > 0:
	        maskmem_features, maskmem_pos_enc = self._encode_new_memory(
	            current_vision_feats=current_vision_feats,
	            feat_sizes=feat_sizes,
	            pred_masks_high_res=high_res_masks,
	            object_score_logits=object_score_logits,
	            # a frame with point prompts counts as "mask from points"
	            is_mask_from_pts=(point_inputs is not None),
	        )
	    current_out["maskmem_features"] = maskmem_features
	    current_out["maskmem_pos_enc"] = maskmem_pos_enc

	def track_step(
	    self,
	    frame_idx,
	    is_init_cond_frame,
	    current_vision_feats,
	    current_vision_pos_embeds,
	    feat_sizes,
	    point_inputs,
	    mask_inputs,
	    output_dict,
	    num_frames,
	    track_in_reverse=False,  # tracking in reverse time order (for demo usage)
	    # Set to False to skip the memory encoder on the predicted masks.
	    run_mem_encoder=True,
	    # Previously predicted SAM mask logits, forwarded to `_track_step`.
	    prev_sam_mask_logits=None,
	):
	    """Process one frame: predict masks, then (optionally) encode a memory.

	    Returns:
	        dict: the output of ``self._track_step`` extended with ``pred_masks``,
	        ``pred_masks_high_res``, ``obj_ptr``, ``object_score_logits`` (only
	        when ``self.training`` is false), plus whatever
	        ``self._encode_memory_in_output`` writes into it.
	    """
	    current_out, sam_outputs, _, _ = self._track_step(
	        frame_idx,
	        is_init_cond_frame,
	        current_vision_feats,
	        current_vision_pos_embeds,
	        feat_sizes,
	        point_inputs,
	        mask_inputs,
	        output_dict,
	        num_frames,
	        track_in_reverse,
	        prev_sam_mask_logits,
	    )

	    # Only the last four of the seven SAM head outputs are used here.
	    _, _, _, low_res_masks, high_res_masks, obj_ptr, object_score_logits = sam_outputs

	    current_out["pred_masks"] = low_res_masks
	    current_out["pred_masks_high_res"] = high_res_masks
	    current_out["obj_ptr"] = obj_ptr
	    if not self.training:
	        # Only exposed at inference (to avoid an unused output under
	        # activation checkpointing during training).
	        current_out["object_score_logits"] = object_score_logits

	    # Finally encode the predicted mask into a new memory feature; the helper
	    # writes its results into `current_out`.
	    self._encode_memory_in_output(
	        current_vision_feats,
	        feat_sizes,
	        point_inputs,
	        run_mem_encoder,
	        high_res_masks,
	        object_score_logits,
	        current_out,
	    )
	    return current_out

	def _use_multimask(self, is_init_cond_frame, point_inputs):
	    """Decide whether the SAM head should produce multiple candidate masks.

	    Multi-mask output requires ``self.multimask_output_in_sam``, a frame that is
	    either an initial conditioning frame or covered by
	    ``self.multimask_output_for_tracking``, and a point count (size of dim 1 of
	    ``point_inputs["point_labels"]``, or 0 without points) inside
	    ``[self.multimask_min_pt_num, self.multimask_max_pt_num]``.
	    """
	    if point_inputs is None:
	        num_pts = 0
	    else:
	        num_pts = point_inputs["point_labels"].size(1)
	    return (
	        self.multimask_output_in_sam
	        and (is_init_cond_frame or self.multimask_output_for_tracking)
	        and (self.multimask_min_pt_num <= num_pts <= self.multimask_max_pt_num)
	    )

	def _apply_non_overlapping_constraints(self, pred_masks):
	    """Keep, at every location, only the highest-scoring slice along dim 0.

	    Scores of every other slice are clamped to at most -10.0 at that location.
	    A input with a single slice along dim 0 is returned unchanged.
	    """
	    num_objs = pred_masks.size(0)
	    if num_objs == 1:
	        return pred_masks

	    # index (along dim 0) of the winning object at each location
	    winner = pred_masks.argmax(dim=0, keepdim=True)
	    # index of each slice along dim 0, shaped to broadcast against `winner`
	    obj_index = torch.arange(num_objs, device=pred_masks.device).view(-1, 1, 1, 1)
	    is_winner = winner == obj_index
	    # push losing scores to <= -10.0 (sigmoid(-10.0) = 4.5398e-05) so that
	    # foreground regions of different objects do not overlap
	    suppressed = torch.clamp(pred_masks, max=-10.0)
	    return torch.where(is_winner, pred_masks, suppressed)


	class MaskEncoder(nn.Module):
	    """Produce mask-token embeddings from an image embedding and prompts.

	    Learned mask tokens are prepended to the sparse prompt embeddings, run
	    through ``transformer`` together with the image embedding, and the
	    transformer outputs at the mask-token positions are returned.
	    """

	    def __init__(
	        self,
	        *,
	        transformer_dim: int,
	        transformer: nn.Module,
	        num_mask_tokens: int = 4,
	    ) -> None:
	        """
	        Arguments:
	          transformer_dim (int): the channel dimension of the transformer
	            (also the width of the learned mask-token embeddings)
	          transformer (nn.Module): the transformer applied to the image
	            embedding and the tokens; called as ``transformer(src, pos, tokens)``
	            and expected to return a ``(token_outputs, src_outputs)`` pair
	          num_mask_tokens (int): the number of learned mask tokens
	        """
	        super().__init__()
	        self.transformer_dim = transformer_dim
	        self.transformer = transformer

	        self.mask_tokens = nn.Embedding(num_mask_tokens, transformer_dim)
	        self.num_mask_tokens = num_mask_tokens

	    def forward(
	        self,
	        image_embeddings: torch.Tensor,
	        image_pe: torch.Tensor,
	        sparse_prompt_embeddings: torch.Tensor,
	        dense_prompt_embeddings: torch.Tensor,
	        repeat_image: bool,
	    ) -> torch.Tensor:
	        """
	        Encode the prompts into mask tokens.

	        Arguments:
	          image_embeddings (torch.Tensor): the embeddings from the image encoder
	          image_pe (torch.Tensor): positional encoding with the shape of
	            image_embeddings; its batch dimension must be 1
	          sparse_prompt_embeddings (torch.Tensor): sparse prompt embeddings,
	            concatenated after the mask tokens along dim 1
	          dense_prompt_embeddings (torch.Tensor): dense prompt embeddings,
	            added to the image embeddings
	          repeat_image (bool): if True, repeat ``image_embeddings`` along the
	            batch dimension once per token batch entry; if False, its batch
	            size must already equal that of the tokens

	        Returns:
	          torch.Tensor: the transformer outputs at the first
	            ``num_mask_tokens`` token positions
	        """
	        return self.predict_masks(
	            image_embeddings=image_embeddings,
	            image_pe=image_pe,
	            sparse_prompt_embeddings=sparse_prompt_embeddings,
	            dense_prompt_embeddings=dense_prompt_embeddings,
	            repeat_image=repeat_image,
	        )

	    def predict_masks(
	        self,
	        image_embeddings: torch.Tensor,
	        image_pe: torch.Tensor,
	        sparse_prompt_embeddings: torch.Tensor,
	        dense_prompt_embeddings: torch.Tensor,
	        repeat_image: bool,
	    ) -> torch.Tensor:
	        """Compute the mask-token outputs. See 'forward' for more details."""
	        # Mask tokens come first, followed by the sparse prompt tokens.
	        output_tokens = self.mask_tokens.weight.unsqueeze(0).expand(
	            sparse_prompt_embeddings.size(0), -1, -1
	        )
	        tokens = torch.cat((output_tokens, sparse_prompt_embeddings), dim=1)

	        # Expand per-image data in batch direction to match the token batch
	        if repeat_image:
	            src = torch.repeat_interleave(image_embeddings, tokens.shape[0], dim=0)
	        else:
	            assert image_embeddings.shape[0] == tokens.shape[0]
	            src = image_embeddings
	        src = src + dense_prompt_embeddings
	        assert (
	            image_pe.size(0) == 1
	        ), "image_pe should have size 1 in batch dim (from `get_dense_pe()`)"
	        pos_src = torch.repeat_interleave(image_pe, tokens.shape[0], dim=0)

	        # Run the transformer; only the token outputs are needed here.
	        hs, _ = self.transformer(src, pos_src, tokens)
	        # The mask tokens occupy the leading positions of the token sequence.
	        return hs[:, : self.num_mask_tokens, :]


	class SAM2Base(_SAM2Base):
	    """`_SAM2Base` variant with an extra mask encoder and language prompts.

	    This subclass builds an additional ``sam_mask_encoder`` head, and its
	    ``track_step`` / ``_forward_sam_heads`` accept an optional
	    ``language_embed`` that is appended to the sparse prompt embeddings.
	    """

	    def _build_sam_heads(self):
	        """Build SAM-style prompt encoder, mask decoder and mask encoder."""
	        self.sam_prompt_embed_dim = self.hidden_dim
	        self.sam_image_embedding_size = self.image_size // self.backbone_stride

	        # build PromptEncoder and MaskDecoder from SAM
	        # (their hyperparameters like `mask_in_chans=16` are from SAM code)
	        self.sam_prompt_encoder = PromptEncoder(
	            embed_dim=self.sam_prompt_embed_dim,
	            image_embedding_size=(
	                self.sam_image_embedding_size,
	                self.sam_image_embedding_size,
	            ),
	            input_image_size=(self.image_size, self.image_size),
	            mask_in_chans=16,
	        )
	        self.sam_mask_decoder = MaskDecoder(
	            num_multimask_outputs=3,
	            transformer=TwoWayTransformer(
	                depth=2,
	                embedding_dim=self.sam_prompt_embed_dim,
	                mlp_dim=2048,
	                num_heads=8,
	            ),
	            transformer_dim=self.sam_prompt_embed_dim,
	            iou_head_depth=3,
	            iou_head_hidden_dim=256,
	            use_high_res_features=self.use_high_res_features_in_sam,
	            iou_prediction_use_sigmoid=self.iou_prediction_use_sigmoid,
	            pred_obj_scores=self.pred_obj_scores,
	            pred_obj_scores_mlp=self.pred_obj_scores_mlp,
	            use_multimask_token_for_obj_ptr=self.use_multimask_token_for_obj_ptr,
	            **(self.sam_mask_decoder_extra_args or {}),
	        )
	        # The number of mask tokens is read from the environment variable
	        # MASK_TOKENIZER_NUM_MASK_TOKEN (default 1) at construction time.
	        self.sam_mask_encoder = MaskEncoder(
	            transformer=TwoWayTransformer(
	                depth=2,
	                embedding_dim=self.sam_prompt_embed_dim,
	                mlp_dim=2048,
	                num_heads=8
	            ),
	            transformer_dim=self.sam_prompt_embed_dim,
	            num_mask_tokens=int(os.environ.get("MASK_TOKENIZER_NUM_MASK_TOKEN", 1)),
	        )
	        if self.use_obj_ptrs_in_encoder:
	            # a linear projection on SAM output tokens to turn them into object pointers
	            self.obj_ptr_proj = torch.nn.Linear(self.hidden_dim, self.hidden_dim)
	            if self.use_mlp_for_obj_ptr_proj:
	                self.obj_ptr_proj = MLP(
	                    self.hidden_dim, self.hidden_dim, self.hidden_dim, 3
	                )
	        else:
	            self.obj_ptr_proj = torch.nn.Identity()
	        if self.proj_tpos_enc_in_obj_ptrs:
	            # a linear projection on temporal positional encoding in object pointers to
	            # avoid potential interference with spatial positional encoding
	            self.obj_ptr_tpos_proj = torch.nn.Linear(self.hidden_dim, self.mem_dim)
	        else:
	            self.obj_ptr_tpos_proj = torch.nn.Identity()

	    def track_step(
	        self,
	        frame_idx,
	        is_init_cond_frame,
	        current_vision_feats,
	        current_vision_pos_embeds,
	        feat_sizes,
	        point_inputs,
	        mask_inputs,
	        output_dict,
	        num_frames,
	        track_in_reverse=False,  # tracking in reverse time order (for demo usage)
	        # Whether to run the memory encoder on the predicted masks. Sometimes we might want
	        # to skip the memory encoder with `run_mem_encoder=False`. For example,
	        # in demo we might call `track_step` multiple times for each user click,
	        # and only encode the memory when the user finalizes their clicks. And in ablation
	        # settings like SAM training on static images, we don't need the memory encoder.
	        run_mem_encoder=True,
	        # The previously predicted SAM mask logits (which can be fed together with new clicks in demo).
	        prev_sam_mask_logits=None,
	        ## Extension: LLM prompt
	        language_embed=None,
	    ):
	        """Process one frame, optionally conditioned on a language embedding.

	        ``language_embed`` is forwarded to ``_forward_sam_heads`` only when the
	        SAM heads are actually run (i.e. not on the direct mask-as-output path).

	        Returns:
	            dict with ``point_inputs``, ``mask_inputs``, ``pred_masks``,
	            ``pred_masks_high_res``, ``obj_ptr``, ``maskmem_features`` and
	            ``maskmem_pos_enc`` (the last two are ``None`` when the memory
	            encoder is skipped).
	        """
	        current_out = {"point_inputs": point_inputs, "mask_inputs": mask_inputs}
	        # High-resolution feature maps for the SAM head, reshape (HW)BC => BCHW
	        if len(current_vision_feats) > 1:
	            high_res_features = [
	                x.permute(1, 2, 0).view(x.size(1), x.size(2), *s).contiguous()
	                for x, s in zip(current_vision_feats[:-1], feat_sizes[:-1])
	            ]
	        else:
	            high_res_features = None
	        if mask_inputs is not None and self.use_mask_input_as_output_without_sam:
	            # When use_mask_input_as_output_without_sam=True, we directly output the mask input
	            # (see it as a GT mask) without using a SAM prompt encoder + mask decoder.
	            pix_feat = current_vision_feats[-1].permute(1, 2, 0).contiguous()
	            pix_feat = pix_feat.view(-1, self.hidden_dim, *feat_sizes[-1]).contiguous()
	            sam_outputs = self._use_mask_as_output(
	                pix_feat, high_res_features, mask_inputs
	            )
	        else:
	            # fused the visual feature with previous memory features in the memory bank
	            pix_feat_with_mem = self._prepare_memory_conditioned_features(
	                frame_idx=frame_idx,
	                is_init_cond_frame=is_init_cond_frame,
	                current_vision_feats=current_vision_feats[-1:],
	                current_vision_pos_embeds=current_vision_pos_embeds[-1:],
	                feat_sizes=feat_sizes[-1:],
	                output_dict=output_dict,
	                num_frames=num_frames,
	                track_in_reverse=track_in_reverse,
	            )
	            # apply SAM-style segmentation head
	            # here we might feed previously predicted low-res SAM mask logits into the SAM mask decoder,
	            # e.g. in demo where such logits come from earlier interaction instead of correction sampling
	            # (in this case, any `mask_inputs` shouldn't reach here as they are sent to _use_mask_as_output instead)
	            if prev_sam_mask_logits is not None:
	                assert point_inputs is not None and mask_inputs is None
	                mask_inputs = prev_sam_mask_logits
	            multimask_output = self._use_multimask(is_init_cond_frame, point_inputs)
	            sam_outputs = self._forward_sam_heads(
	                backbone_features=pix_feat_with_mem,
	                point_inputs=point_inputs,
	                mask_inputs=mask_inputs,
	                high_res_features=high_res_features,
	                multimask_output=multimask_output,
	                # Inject language Embed if possible
	                language_embed=language_embed,
	            )
	        (
	            _,
	            _,
	            _,
	            low_res_masks,
	            high_res_masks,
	            obj_ptr,
	            object_score_logits,
	        ) = sam_outputs

	        current_out["pred_masks"] = low_res_masks
	        current_out["pred_masks_high_res"] = high_res_masks
	        current_out["obj_ptr"] = obj_ptr

	        # Finally run the memory encoder on the predicted mask to encode
	        # it into a new memory feature (that can be used in future frames)
	        if run_mem_encoder and self.num_maskmem > 0:
	            high_res_masks_for_mem_enc = high_res_masks
	            # `_encode_new_memory` requires the object score logits; omitting
	            # them makes this call fail with a TypeError.
	            maskmem_features, maskmem_pos_enc = self._encode_new_memory(
	                current_vision_feats=current_vision_feats,
	                feat_sizes=feat_sizes,
	                pred_masks_high_res=high_res_masks_for_mem_enc,
	                object_score_logits=object_score_logits,
	                is_mask_from_pts=(point_inputs is not None),
	            )
	            current_out["maskmem_features"] = maskmem_features
	            current_out["maskmem_pos_enc"] = maskmem_pos_enc
	        else:
	            current_out["maskmem_features"] = None
	            current_out["maskmem_pos_enc"] = None

	        return current_out

	    def _forward_sam_heads(
	        self,
	        backbone_features,
	        point_inputs=None,
	        mask_inputs=None,
	        high_res_features=None,
	        multimask_output=False,
	        ## Extension: LLM prompt
	        language_embed=None,
	    ):
	        """
	        Forward SAM prompt encoders and mask heads.

	        Inputs:
	        - backbone_features: image features of [B, C, H, W] shape
	        - point_inputs: a dictionary with "point_coords" and "point_labels", where
	          1) "point_coords" has [B, P, 2] shape and float32 dtype and contains the
	             absolute pixel-unit coordinate in (x, y) format of the P input points
	          2) "point_labels" has shape [B, P] and int32 dtype, where 1 means
	             positive clicks, 0 means negative clicks, and -1 means padding
	        - mask_inputs: a mask of [B, 1, H16, W16] shape, float or bool, with the
	          same spatial size as the image.
	        - high_res_features: either 1) None or 2) or a list of length 2 containing
	          two feature maps of [B, C, 4H, 4W] and [B, C, 2H, 2W] shapes respectively,
	          which will be used as high-resolution feature maps for SAM decoder.
	        - multimask_output: if it's True, we output 3 candidate masks and their 3
	          corresponding IoU estimates, and if it's False, we output only 1 mask and
	          its corresponding IoU estimate.
	        - language_embed: either None or a [B, N, C] tensor that is concatenated
	          to the sparse prompt embeddings along dim 1 (its batch and channel
	          sizes must match those of the sparse embeddings).

	        Outputs:
	        - low_res_multimasks: [B, M, H4, W4] shape (where M = 3 if
	          `multimask_output=True` and M = 1 if `multimask_output=False`), the SAM
	          output mask logits (before sigmoid) for the low-resolution masks, with 4x
	          the resolution (1/4 stride) of the input backbone_features.
	        - high_res_multimasks: [B, M, H16, W16] shape (where M = 3
	          if `multimask_output=True` and M = 1 if `multimask_output=False`),
	          upsampled from the low-resolution masks, with shape size as the image
	          (stride is 1 pixel).
	        - ious, [B, M] shape, where (where M = 3 if `multimask_output=True` and M = 1
	          if `multimask_output=False`), the estimated IoU of each output mask.
	        - low_res_masks: [B, 1, H4, W4] shape, the best mask in `low_res_multimasks`.
	          If `multimask_output=True`, it's the mask with the highest IoU estimate.
	          If `multimask_output=False`, it's the same as `low_res_multimasks`.
	        - high_res_masks: [B, 1, H16, W16] shape, the best mask in `high_res_multimasks`.
	          If `multimask_output=True`, it's the mask with the highest IoU estimate.
	          If `multimask_output=False`, it's the same as `high_res_multimasks`.
	        - obj_ptr: [B, C] shape, the object pointer vector for the output mask, extracted
	          based on the output token from the SAM mask decoder.
	        - object_score_logits: the object score logits returned by the SAM mask
	          decoder, passed through unchanged.
	        """
	        B = backbone_features.size(0)
	        device = backbone_features.device
	        assert backbone_features.size(1) == self.sam_prompt_embed_dim
	        assert backbone_features.size(2) == self.sam_image_embedding_size
	        assert backbone_features.size(3) == self.sam_image_embedding_size

	        # a) Handle point prompts
	        if point_inputs is not None:
	            sam_point_coords = point_inputs["point_coords"]
	            sam_point_labels = point_inputs["point_labels"]
	            assert sam_point_coords.size(0) == B and sam_point_labels.size(0) == B
	        else:
	            # If no points are provided, pad with an empty point (with label -1)
	            sam_point_coords = torch.zeros(B, 1, 2, device=device)
	            sam_point_labels = -torch.ones(B, 1, dtype=torch.int32, device=device)

	        # b) Handle mask prompts
	        if mask_inputs is not None:
	            # If mask_inputs is provided, downsize it into low-res mask input if needed
	            # and feed it as a dense mask prompt into the SAM mask encoder
	            assert len(mask_inputs.shape) == 4 and mask_inputs.shape[:2] == (B, 1)
	            if mask_inputs.shape[-2:] != self.sam_prompt_encoder.mask_input_size:
	                sam_mask_prompt = F.interpolate(
	                    mask_inputs.float(),
	                    size=self.sam_prompt_encoder.mask_input_size,
	                    align_corners=False,
	                    mode="bilinear",
	                    antialias=True,  # use antialias for downsampling
	                )
	            else:
	                sam_mask_prompt = mask_inputs
	        else:
	            # Otherwise, simply feed None (and SAM's prompt encoder will add
	            # a learned `no_mask_embed` to indicate no mask input in this case).
	            sam_mask_prompt = None

	        sparse_embeddings, dense_embeddings = self.sam_prompt_encoder(
	            points=(sam_point_coords, sam_point_labels),
	            boxes=None,
	            masks=sam_mask_prompt,
	        )

	        ## Extension: LLM prompt
	        if language_embed is not None:
	            # B, N, C
	            assert sparse_embeddings.size(0) == language_embed.size(0)
	            assert sparse_embeddings.size(2) == language_embed.size(2)
	            sparse_embeddings = torch.cat([sparse_embeddings, language_embed], dim=1)

	        (
	            low_res_multimasks,
	            ious,
	            sam_output_tokens,
	            object_score_logits,
	        ) = self.sam_mask_decoder(
	            image_embeddings=backbone_features,
	            image_pe=self.sam_prompt_encoder.get_dense_pe(),
	            sparse_prompt_embeddings=sparse_embeddings,
	            dense_prompt_embeddings=dense_embeddings,
	            multimask_output=multimask_output,
	            repeat_image=False,  # the image is already batched
	            high_res_features=high_res_features,
	        )
	        if self.pred_obj_scores:
	            is_obj_appearing = object_score_logits > 0

	            # Mask used for spatial memories is always a hard choice between obj and no obj,
	            # consistent with the actual mask prediction
	            # NOTE: the hard masking below is disabled in this class, so the
	            # mask logits are returned as produced by the decoder.
	            # print('Do torch.where !!!')
	            # low_res_multimasks = torch.where(
	            #     is_obj_appearing[:, None, None],
	            #     low_res_multimasks,
	            #     NO_OBJ_SCORE,
	            # )

	        # convert masks from possibly bfloat16 (or float16) to float32
	        # (older PyTorch versions before 2.1 don't support `interpolate` on bf16)
	        low_res_multimasks = low_res_multimasks.float()
	        high_res_multimasks = F.interpolate(
	            low_res_multimasks,
	            size=(self.image_size, self.image_size),
	            mode="bilinear",
	            align_corners=False,
	        )

	        sam_output_token = sam_output_tokens[:, 0]
	        if multimask_output:
	            # take the best mask prediction (with the highest IoU estimation)
	            best_iou_inds = torch.argmax(ious, dim=-1)
	            batch_inds = torch.arange(B, device=device)
	            low_res_masks = low_res_multimasks[batch_inds, best_iou_inds].unsqueeze(1)
	            high_res_masks = high_res_multimasks[batch_inds, best_iou_inds].unsqueeze(1)
	            if sam_output_tokens.size(1) > 1:
	                sam_output_token = sam_output_tokens[batch_inds, best_iou_inds]
	        else:
	            low_res_masks, high_res_masks = low_res_multimasks, high_res_multimasks

	        # Extract object pointer from the SAM output token (with occlusion handling)
	        obj_ptr = self.obj_ptr_proj(sam_output_token)
	        if self.pred_obj_scores:
	            # Allow soft no obj ptr, unlike for masks
	            if self.soft_no_obj_ptr:
	                # Only hard possible with gt
	                assert not self.teacher_force_obj_scores_for_mem
	                lambda_is_obj_appearing = object_score_logits.sigmoid()
	            else:
	                lambda_is_obj_appearing = is_obj_appearing.float()

	            if self.fixed_no_obj_ptr:
	                obj_ptr = lambda_is_obj_appearing * obj_ptr
	            obj_ptr = obj_ptr + (1 - lambda_is_obj_appearing) * self.no_obj_ptr

	        return (
	            low_res_multimasks,
	            high_res_multimasks,
	            ious,
	            low_res_masks,
	            high_res_masks,
	            obj_ptr,
	            object_score_logits,
	        )


	class ImageEncoder(nn.Module):
	    """Run a trunk followed by a neck and package the multi-level outputs."""

	    def __init__(
	        self,
	        trunk: nn.Module,
	        neck: nn.Module,
	        scalp: int = 0,
	    ):
	        """
	        Args:
	            trunk: backbone module; must expose ``channel_list``.
	            neck: module applied to the trunk output; must expose
	                ``backbone_channel_list`` equal to the trunk's ``channel_list``
	                and return a ``(features, pos)`` pair.
	            scalp: number of trailing feature levels to drop from the neck output.
	        """
	        super().__init__()
	        self.trunk = trunk
	        self.neck = neck
	        self.scalp = scalp
	        channels_match = self.trunk.channel_list == self.neck.backbone_channel_list
	        assert (
	            channels_match
	        ), f"Channel dims of trunk and neck do not match. Trunk: {self.trunk.channel_list}, neck: {self.neck.backbone_channel_list}"

	    def forward(self, sample: torch.Tensor):
	        """Return a dict with ``vision_features``, ``vision_pos_enc`` and ``backbone_fpn``."""
	        features, pos = self.neck(self.trunk(sample))
	        if self.scalp > 0:
	            # drop the last `scalp` levels from both lists
	            keep_until = -self.scalp
	            features, pos = features[:keep_until], pos[:keep_until]

	        return {
	            "vision_features": features[-1],
	            "vision_pos_enc": pos,
	            "backbone_fpn": features,
	        }

	def window_partition(x, window_size):
	    """
	    Split a feature map into non-overlapping square windows, zero-padding the
	    bottom/right edges when the size is not a multiple of the window size.
	    Args:
	        x (tensor): input tokens with [B, H, W, C].
	        window_size (int): window size.
	    Returns:
	        windows: windows after partition with [B * num_windows, window_size, window_size, C].
	        (Hp, Wp): padded height and width before partition
	    """
	    batch, height, width, channels = x.shape

	    # amount needed to reach the next multiple of window_size (0 if aligned)
	    pad_bottom = -height % window_size
	    pad_right = -width % window_size
	    if pad_bottom > 0 or pad_right > 0:
	        # F.pad pads from the last dim backwards: (C, C, W, W, H, H)
	        x = F.pad(x, (0, 0, 0, pad_right, 0, pad_bottom))
	    padded_h = height + pad_bottom
	    padded_w = width + pad_right

	    rows = padded_h // window_size
	    cols = padded_w // window_size
	    grid = x.view(batch, rows, window_size, cols, window_size, channels).contiguous()
	    # bring the two window-grid axes together, then fold them into the batch
	    windows = (
	        grid.permute(0, 1, 3, 2, 4, 5)
	        .reshape(-1, window_size, window_size, channels)
	        .contiguous()
	    )
	    return windows, (padded_h, padded_w)


	def window_unpartition(windows, window_size, pad_hw, hw):
	    """
	    Reassemble windows into full feature maps and crop away any padding.
	    Args:
	        windows (tensor): input tokens with [B * num_windows, window_size, window_size, C].
	        window_size (int): window size.
	        pad_hw (Tuple): padded height and width (Hp, Wp).
	        hw (Tuple): original height and width (H, W) before padding.
	    Returns:
	        x: unpartitioned sequences with [B, H, W, C].
	    """
	    padded_h, padded_w = pad_hw
	    height, width = hw
	    rows = padded_h // window_size
	    cols = padded_w // window_size
	    windows_per_image = padded_h * padded_w // window_size // window_size
	    batch = windows.shape[0] // windows_per_image

	    grid = windows.reshape(batch, rows, cols, window_size, window_size, -1).contiguous()
	    # interleave grid and in-window axes back into (Hp, Wp)
	    x = grid.permute(0, 1, 3, 2, 4, 5).reshape(batch, padded_h, padded_w, -1).contiguous()

	    needs_crop = padded_h > height or padded_w > width
	    if needs_crop:
	        x = x[:, :height, :width, :]
	    return x

	class DropPath(nn.Module):
	    """Randomly zero whole samples (along dim 0) during training.

	    Adapted from
	    https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/drop.py
	    """

	    def __init__(self, drop_prob=0.0, scale_by_keep=True):
	        """
	        Args:
	            drop_prob: probability of dropping a sample.
	            scale_by_keep: if True, kept samples are divided by the keep
	                probability (when that probability is positive).
	        """
	        super().__init__()
	        self.drop_prob = drop_prob
	        self.scale_by_keep = scale_by_keep

	    def forward(self, x):
	        # identity when disabled or outside of training
	        if not self.training or self.drop_prob == 0.0:
	            return x
	        keep_prob = 1 - self.drop_prob
	        # one Bernoulli draw per sample, broadcast over all remaining dims
	        mask_shape = (x.shape[0], *([1] * (x.ndim - 1)))
	        keep_mask = x.new_empty(mask_shape).bernoulli_(keep_prob)
	        if self.scale_by_keep and keep_prob > 0.0:
	            keep_mask.div_(keep_prob)
	        return x * keep_mask

	class PatchEmbed(nn.Module):
	    """
	    Image to patch embedding via a single strided convolution.
	    """

	    def __init__(
	        self,
	        kernel_size: Tuple[int, ...] = (7, 7),
	        stride: Tuple[int, ...] = (4, 4),
	        padding: Tuple[int, ...] = (3, 3),
	        in_chans: int = 3,
	        embed_dim: int = 768,
	    ):
	        """
	        Args:
	            kernel_size (Tuple): kernel size of the projection layer.
	            stride (Tuple): stride of the projection layer.
	            padding (Tuple): padding size of the projection layer.
	            in_chans (int): Number of input image channels.
	            embed_dim (int): Patch embedding dimension.
	        """
	        super().__init__()
	        self.proj = nn.Conv2d(
	            in_channels=in_chans,
	            out_channels=embed_dim,
	            kernel_size=kernel_size,
	            stride=stride,
	            padding=padding,
	        )

	    def forward(self, x: torch.Tensor) -> torch.Tensor:
	        """Project the image and return channels-last features (B C H W -> B H W C)."""
	        projected = self.proj(x)
	        return projected.permute(0, 2, 3, 1).contiguous()

	class FpnNeck(nn.Module):
	    """
	    A modified variant of Feature Pyramid Network (FPN) neck: one lateral
	    convolution per backbone level, optional top-down fusion via 2x
	    interpolation, and no output convolution.
	    """

	    def __init__(
	        self,
	        position_encoding: nn.Module,
	        d_model: int,
	        backbone_channel_list: List[int],
	        kernel_size: int = 1,
	        stride: int = 1,
	        padding: int = 0,
	        fpn_interp_model: str = "bilinear",
	        fuse_type: str = "sum",
	        fpn_top_down_levels: Optional[List[int]] = None,
	    ):
	        """Initialize the neck
	        :param position_encoding: the positional encoding applied to each output level
	        :param d_model: the number of output channels of every lateral conv
	        :param backbone_channel_list: input channels of the lateral convs, in
	            the order the convs are created (conv ``k`` is applied to input
	            level ``len - 1 - k`` in ``forward``)
	        :param kernel_size: kernel size of the lateral convs
	        :param stride: stride of the lateral convs
	        :param padding: padding of the lateral convs
	        :param fpn_interp_model: interpolation mode for top-down features
	        :param fuse_type: "sum" or "avg" fusion of lateral and top-down features
	        :param fpn_top_down_levels: levels that receive top-down features
	            (all levels when None)
	        """
	        super().__init__()
	        self.position_encoding = position_encoding
	        self.convs = nn.ModuleList()
	        self.backbone_channel_list = backbone_channel_list
	        self.d_model = d_model
	        for in_dim in backbone_channel_list:
	            lateral = nn.Sequential()
	            lateral.add_module(
	                "conv",
	                nn.Conv2d(
	                    in_channels=in_dim,
	                    out_channels=d_model,
	                    kernel_size=kernel_size,
	                    stride=stride,
	                    padding=padding,
	                ),
	            )
	            self.convs.append(lateral)
	        self.fpn_interp_model = fpn_interp_model
	        assert fuse_type in ["sum", "avg"]
	        self.fuse_type = fuse_type

	        # Levels whose outputs include top-down features; e.g. with [2, 3]
	        # only levels 2 and 3 get top-down propagation while the other levels
	        # carry lateral features only. Default: every level.
	        if fpn_top_down_levels is None:
	            fpn_top_down_levels = range(len(self.convs))
	        self.fpn_top_down_levels = list(fpn_top_down_levels)

	    def forward(self, xs: List[torch.Tensor]):
	        """Return ``(out, pos)``: per-level features and their positional encodings."""
	        num_levels = len(self.convs)
	        out = [None] * num_levels
	        pos = [None] * num_levels
	        assert len(xs) == num_levels
	        # fpn forward pass
	        # see https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/backbone/fpn.py
	        last = num_levels - 1
	        # `align_corners` must be None for "nearest" interpolation
	        align_corners = None if self.fpn_interp_model == "nearest" else False
	        prev_features = None
	        # walk the levels from the last index down to 0
	        for level in reversed(range(num_levels)):
	            lateral_features = self.convs[last - level](xs[level])
	            if prev_features is not None and level in self.fpn_top_down_levels:
	                top_down_features = F.interpolate(
	                    prev_features.to(dtype=torch.float32),
	                    scale_factor=2.0,
	                    mode=self.fpn_interp_model,
	                    align_corners=align_corners,
	                    antialias=False,
	                )
	                prev_features = lateral_features + top_down_features
	                if self.fuse_type == "avg":
	                    prev_features /= 2
	            else:
	                prev_features = lateral_features
	            out[level] = prev_features
	            pos[level] = self.position_encoding(prev_features).to(prev_features.dtype)

	        return out, pos


	def do_pool(x: torch.Tensor, pool: Optional[nn.Module], norm: Optional[nn.Module] = None) -> torch.Tensor:
	    """Apply a channels-first pooling module to a channels-last tensor.

	    Args:
	        x: tensor laid out as (B, H, W, C).
	        pool: module applied to the (B, C, H, W) view; when ``None`` the
	            input is returned untouched.
	        norm: optional callable applied to the pooled (B, H', W', C) result.

	    Returns:
	        The pooled (and optionally normalized) tensor in (B, H', W', C) layout.
	    """
	    if pool is None:
	        return x
	    # (B, H, W, C) -> (B, C, H, W)
	    x = x.permute(0, 3, 1, 2).contiguous()
	    x = pool(x)
	    # (B, C, H', W') -> (B, H', W', C)
	    x = x.permute(0, 2, 3, 1).contiguous()
	    # Compare against None explicitly: objects that define __len__ (for
	    # example container modules) can be falsy even though they were supplied.
	    if norm is not None:
	        x = norm(x)

	    return x


	class MultiScaleAttention(nn.Module):
	    """Multi-head self-attention over a (B, H, W, C) map with optional query pooling."""

	    def __init__(
	        self,
	        dim: int,
	        dim_out: int,
	        num_heads: int,
	        q_pool: nn.Module = None,
	    ):
	        """Store the sizes and create the fused qkv and output projections."""
	        super().__init__()

	        self.dim = dim
	        self.dim_out = dim_out
	        self.num_heads = num_heads
	        self.q_pool = q_pool
	        # one linear layer produces queries, keys and values at once
	        self.qkv = nn.Linear(dim, dim_out * 3)
	        self.proj = nn.Linear(dim_out, dim_out)

	    def forward(self, x: torch.Tensor) -> torch.Tensor:
	        """Attend over all H * W positions and return a (B, H', W', dim_out) map.

	        H' and W' equal H and W unless ``q_pool`` is set, in which case they
	        are the spatial size of the pooled queries.
	        """
	        batch, height, width, _ = x.shape
	        # fused projection reshaped to (B, H * W, 3, nHead, C)
	        fused = self.qkv(x).reshape(batch, height * width, 3, self.num_heads, -1).contiguous()
	        # each of shape (B, H * W, nHead, C)
	        query, key, value = torch.unbind(fused, 2)

	        # Q pooling (for downsample at stage changes)
	        if self.q_pool:
	            pooled = do_pool(
	                query.reshape(batch, height, width, -1).contiguous(), self.q_pool
	            )
	            height, width = pooled.shape[1:3]  # downsampled shape
	            query = pooled.reshape(batch, height * width, self.num_heads, -1).contiguous()

	        # Torch's SDPA expects [B, nheads, H*W, C] so we transpose
	        heads_first = [t.transpose(1, 2).contiguous() for t in (query, key, value)]
	        attended = F.scaled_dot_product_attention(*heads_first)

	        # back to (B, H*W, nheads, C), then merge the heads into the channel dim
	        attended = attended.transpose(1, 2).contiguous()
	        attended = attended.reshape(batch, height, width, -1).contiguous()

	        return self.proj(attended)

	class MultiScaleBlock(nn.Module):
	    """Pre-norm transformer block with optional windowed attention and query pooling.

	    The block runs ``norm1 -> attention -> residual`` followed by
	    ``norm2 -> MLP -> residual``. When ``dim != dim_out`` the shortcut is
	    linearly projected (and pooled with the same pooling module as the queries).
	    """

	    def __init__(
	        self,
	        dim: int,
	        dim_out: int,
	        num_heads: int,
	        mlp_ratio: float = 4.0,
	        drop_path: float = 0.0,
	        norm_layer: Union[nn.Module, str] = "LayerNorm",
	        q_stride: Tuple[int, int] = None,
	        act_layer: nn.Module = nn.GELU,
	        window_size: int = 0,
	    ):
	        """
	        Args:
	            dim: input channel count.
	            dim_out: output channel count.
	            num_heads: number of attention heads.
	            mlp_ratio: hidden width of the MLP as a multiple of ``dim_out``.
	            drop_path: drop-path rate; ``0.0`` installs an identity instead.
	            norm_layer: norm constructor, or the name of a class in
	                ``torch.nn`` (instantiated with ``eps=1e-6``).
	            q_stride: kernel/stride of the max-pool applied to the queries;
	                ``None`` disables pooling.
	            act_layer: activation passed to the MLP.
	            window_size: attention window size; ``0`` means no windowing.
	        """
	        super().__init__()

	        if isinstance(norm_layer, str):
	            norm_layer = partial(getattr(nn, norm_layer), eps=1e-6)

	        self.dim = dim
	        self.dim_out = dim_out
	        self.norm1 = norm_layer(dim)

	        self.window_size = window_size

	        self.pool, self.q_stride = None, q_stride
	        if self.q_stride:
	            self.pool = nn.MaxPool2d(
	                kernel_size=q_stride, stride=q_stride, ceil_mode=False
	            )

	        self.attn = MultiScaleAttention(
	            dim,
	            dim_out,
	            num_heads=num_heads,
	            q_pool=self.pool,
	        )
	        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

	        self.norm2 = norm_layer(dim_out)
	        self.mlp = MLP(
	            dim_out,
	            int(dim_out * mlp_ratio),
	            dim_out,
	            num_layers=2,
	            activation=act_layer,
	        )

	        if dim != dim_out:
	            self.proj = nn.Linear(dim, dim_out)

	    def forward(self, x: torch.Tensor) -> torch.Tensor:
	        """Run the block on a (B, H, W, C) tensor and return the updated map."""
	        shortcut = x  # B, H, W, C
	        x = self.norm1(x)

	        # Skip connection
	        if self.dim != self.dim_out:
	            shortcut = do_pool(self.proj(x), self.pool)

	        # Window partition
	        window_size = self.window_size
	        if window_size > 0:
	            H, W = x.shape[1], x.shape[2]
	            x, pad_hw = window_partition(x, window_size)

	        # Window Attention + Q Pooling (if stage change)
	        x = self.attn(x)
	        # The pooled window geometry is only consumed by window_unpartition
	        # below, so skip it for non-windowed blocks. Without this guard a
	        # block with q_stride and window_size == 0 divides by zero.
	        if self.q_stride and self.window_size > 0:
	            # Shapes have changed due to Q pooling
	            window_size = self.window_size // self.q_stride[0]
	            H, W = shortcut.shape[1:3]

	            pad_h = (window_size - H % window_size) % window_size
	            pad_w = (window_size - W % window_size) % window_size
	            pad_hw = (H + pad_h, W + pad_w)

	        # Reverse window partition
	        if self.window_size > 0:
	            x = window_unpartition(x, window_size, pad_hw, (H, W))

	        x = shortcut + self.drop_path(x)
	        # MLP
	        x = x + self.drop_path(self.mlp(self.norm2(x)))
	        return x


	class Hiera(nn.Module):
	    """
	    Trunk made of a patch embedding followed by a flat list of
	    ``MultiScaleBlock`` modules that are grouped into stages.

	    Reference: https://arxiv.org/abs/2306.00989
	    """

	    def __init__(
	        self,
	        embed_dim: int = 96,  # initial embed dim
	        num_heads: int = 1,  # initial number of heads
	        drop_path_rate: float = 0.0,  # stochastic depth
	        q_pool: int = 3,  # number of q_pool stages
	        q_stride: Tuple[int, int] = (2, 2),  # downsample stride bet. stages
	        stages: Tuple[int, ...] = (2, 3, 16, 3),  # blocks per stage
	        dim_mul: float = 2.0,  # dim_mul factor at stage shift
	        head_mul: float = 2.0,  # head_mul factor at stage shift
	        window_pos_embed_bkg_spatial_size: Tuple[int, int] = (14, 14),
	        # window size per stage, when not using global att.
	        window_spec: Tuple[int, ...] = (
	            8,
	            4,
	            14,
	            7,
	        ),
	        # global attn in these blocks
	        global_att_blocks: Tuple[int, ...] = (
	            12,
	            16,
	            20,
	        ),
	        weights_path=None,
	        return_interm_layers=True,  # return feats from every stage
	    ):
	        """Build the patch embedding, positional embeddings and all blocks.

	        ``weights_path`` is accepted but not used by this constructor.
	        """
	        super().__init__()

	        assert len(stages) == len(window_spec)
	        self.window_spec = window_spec
	        self.q_stride = q_stride

	        # index of the last block of every stage (running block count - 1)
	        stage_ends = []
	        depth = 0
	        for blocks_in_stage in stages:
	            depth += blocks_in_stage
	            stage_ends.append(depth - 1)
	        self.stage_ends = stage_ends

	        assert 0 <= q_pool <= len(self.stage_ends[:-1])
	        # query pooling happens in the first block after a stage end,
	        # for at most the first q_pool stage transitions
	        self.q_pool_blocks = [end + 1 for end in self.stage_ends[:-1]][:q_pool]
	        self.return_interm_layers = return_interm_layers

	        self.patch_embed = PatchEmbed(
	            embed_dim=embed_dim,
	        )
	        # Which blocks have global att?
	        self.global_att_blocks = global_att_blocks

	        # Windowed positional embedding (https://arxiv.org/abs/2311.05613)
	        self.window_pos_embed_bkg_spatial_size = window_pos_embed_bkg_spatial_size
	        self.pos_embed = nn.Parameter(
	            torch.zeros(1, embed_dim, *self.window_pos_embed_bkg_spatial_size)
	        )
	        first_window = self.window_spec[0]
	        self.pos_embed_window = nn.Parameter(
	            torch.zeros(1, embed_dim, first_window, first_window)
	        )

	        # stochastic depth decay rule: linearly increasing drop-path rate
	        drop_rates = torch.linspace(0, drop_path_rate, depth).tolist()

	        self.blocks = nn.ModuleList()
	        stage_idx = 0
	        for block_idx in range(depth):
	            # The window size lags by a block: the first block of a new stage
	            # is still built with the previous stage's window size, because
	            # stage_idx is only advanced further down.
	            window_size = self.window_spec[stage_idx]
	            if self.global_att_blocks is not None and block_idx in self.global_att_blocks:
	                window_size = 0

	            dim_out = embed_dim
	            if block_idx - 1 in self.stage_ends:
	                # first block of a new stage: widen channels and heads
	                dim_out = int(embed_dim * dim_mul)
	                num_heads = int(num_heads * head_mul)
	                stage_idx += 1

	            self.blocks.append(
	                MultiScaleBlock(
	                    dim=embed_dim,
	                    dim_out=dim_out,
	                    num_heads=num_heads,
	                    drop_path=drop_rates[block_idx],
	                    q_stride=self.q_stride if block_idx in self.q_pool_blocks else None,
	                    window_size=window_size,
	                )
	            )
	            embed_dim = dim_out

	        if return_interm_layers:
	            # channel widths of the stage outputs, last stage first
	            self.channel_list = [self.blocks[end].dim_out for end in self.stage_ends[::-1]]
	        else:
	            self.channel_list = [self.blocks[-1].dim_out]

	    def _get_pos_embed(self, hw: Tuple[int, int]) -> torch.Tensor:
	        """Return the positional embedding for an (h, w) grid in (1, h, w, C) layout.

	        The background embedding is bicubically resized to (h, w) and the
	        window embedding is tiled over it before both are summed.
	        """
	        height, width = hw
	        window_embed = self.pos_embed_window
	        background = F.interpolate(self.pos_embed, size=(height, width), mode="bicubic")
	        repeats = [
	            full // tile for full, tile in zip(background.shape, window_embed.shape)
	        ]
	        combined = background + window_embed.tile(repeats)
	        return combined.permute(0, 2, 3, 1).contiguous()

	    def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
	        """Embed ``x`` and return the collected stage outputs as (B, C, H, W) tensors.

	        The output of the last stage is always collected; the other stage
	        outputs only when ``return_interm_layers`` is set.
	        """
	        x = self.patch_embed(x)
	        # x: (B, H, W, C)

	        # Add pos embed
	        x = x + self._get_pos_embed(x.shape[1:3])

	        final_block = self.stage_ends[-1]
	        outputs = []
	        for idx, block in enumerate(self.blocks):
	            x = block(x)
	            is_stage_output = idx in self.stage_ends and self.return_interm_layers
	            if idx == final_block or is_stage_output:
	                outputs.append(x.permute(0, 3, 1, 2).contiguous())

	        return outputs

	    def get_layer_id(self, layer_name):
	        """Map a parameter name to a layer index (used for layer-wise decay).

	        Positional and patch embeddings map to 0, ``blocks.<i>.`` to ``i + 1``
	        and everything else (including ``rel_pos``) to ``num_layers + 1``.
	        """
	        # https://github.com/microsoft/unilm/blob/master/beit/optim_factory.py#L33
	        num_layers = self.get_num_layers()

	        if "rel_pos" in layer_name:
	            return num_layers + 1
	        if "pos_embed" in layer_name or "patch_embed" in layer_name:
	            return 0
	        if "blocks" in layer_name:
	            block_index = layer_name.split("blocks")[1].split(".")[1]
	            return int(block_index) + 1
	        return num_layers + 1

	    def get_num_layers(self) -> int:
	        """Return the number of blocks in the trunk."""
	        return len(self.blocks)


	class PositionEmbeddingSine(nn.Module):
	    """
	    Sinusoidal position embedding for images, in the spirit of the one used
	    by the Attention Is All You Need paper but applied to 2-D grids.
	    Dense embeddings are cached per (H, W) grid size.
	    """

	    def __init__(
	        self,
	        num_pos_feats,
	        temperature: int = 10000,
	        normalize: bool = True,
	        scale: Optional[float] = None,
	        # Following settings only relevant
	        # for warming up cache for compilation
	        warmup_cache: bool = True,
	        image_size: int = 1024,
	        strides: Tuple[int] = (4, 8, 16, 32),
	    ):
	        """
	        Args:
	            num_pos_feats: total embedding width; half is used for y, half for x.
	            temperature: base of the geometric frequency progression.
	            normalize: rescale grid coordinates to ``(0, scale]`` before encoding.
	            scale: multiplier for normalized coordinates; defaults to ``2 * pi``.
	            warmup_cache: pre-compute embeddings on CUDA (when available) for
	                every ``image_size // stride`` grid.
	            image_size: image side length used for the warm-up grids.
	            strides: strides used for the warm-up grids.

	        Raises:
	            ValueError: if ``scale`` is given while ``normalize`` is False.
	        """
	        super().__init__()
	        assert num_pos_feats % 2 == 0, "Expecting even model width"
	        self.num_pos_feats = num_pos_feats // 2
	        self.temperature = temperature
	        self.normalize = normalize
	        if scale is not None and normalize is False:
	            raise ValueError("normalize should be True if scale is passed")
	        self.scale = 2 * math.pi if scale is None else scale

	        self.cache = {}
	        if warmup_cache and torch.cuda.is_available():
	            # Warmup cache for cuda, to help with compilation
	            cuda = torch.device("cuda")
	            for stride in strides:
	                side = image_size // stride
	                self._pe(1, cuda, side, side)

	    def _frequencies(self, device):
	        """Return the per-channel divisors ``temperature ** (2 * (i // 2) / n)``."""
	        dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=device)
	        return self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)

	    @staticmethod
	    def _sin_cos(angles):
	        """Interleave sin of the even channels with cos of the odd channels (last dim)."""
	        paired = torch.stack(
	            (angles[..., 0::2].sin(), angles[..., 1::2].cos()), dim=-1
	        )
	        return paired.flatten(-2)

	    def _encode_xy(self, x, y):
	        """Encode two equally long 1-D coordinate vectors; returns ``(pos_x, pos_y)``."""
	        # The positions are expected to be normalized
	        assert len(x) == len(y) and x.ndim == y.ndim == 1
	        x_embed = x * self.scale
	        y_embed = y * self.scale

	        dim_t = self._frequencies(x.device)

	        pos_x = self._sin_cos(x_embed[:, None] / dim_t)
	        pos_y = self._sin_cos(y_embed[:, None] / dim_t)
	        return pos_x, pos_y

	    @torch.no_grad()
	    def encode_boxes(self, x, y, w, h):
	        """Return ``[pos_y, pos_x, h, w]`` concatenated along dim 1 for each box."""
	        pos_x, pos_y = self._encode_xy(x, y)
	        return torch.cat((pos_y, pos_x, h[:, None], w[:, None]), dim=1)

	    encode = encode_boxes  # Backwards compatibility

	    @torch.no_grad()
	    def encode_points(self, x, y, labels):
	        """Return ``[pos_y, pos_x, label]`` for equally shaped 2-D ``x``, ``y``, ``labels``."""
	        (bx, nx), (by, ny), (bl, nl) = x.shape, y.shape, labels.shape
	        assert bx == by and nx == ny and bx == bl and nx == nl
	        pos_x, pos_y = self._encode_xy(x.flatten(), y.flatten())
	        pos_x = pos_x.reshape(bx, nx, -1)
	        pos_y = pos_y.reshape(by, ny, -1)
	        return torch.cat((pos_y, pos_x, labels[:, :, None]), dim=2)

	    @torch.no_grad()
	    def _pe(self, B, device, *cache_key):
	        """Return the (B, C, H, W) embedding for the grid ``cache_key == (H, W)``.

	        A cache hit is moved to ``device`` and repeated ``B`` times; a miss
	        is computed on ``device`` and its first batch entry is cached.
	        """
	        H, W = cache_key
	        cached = self.cache.get(cache_key)
	        if cached is not None:
	            return cached.to(device)[None].repeat(B, 1, 1, 1)

	        # 1-based row / column indices broadcast over the whole grid
	        rows = torch.arange(1, H + 1, dtype=torch.float32, device=device)
	        cols = torch.arange(1, W + 1, dtype=torch.float32, device=device)
	        y_embed = rows.view(1, -1, 1).repeat(B, 1, W)
	        x_embed = cols.view(1, 1, -1).repeat(B, H, 1)

	        if self.normalize:
	            eps = 1e-6
	            # divide by the last row / column index so coordinates end at ~scale
	            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
	            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale

	        dim_t = self._frequencies(device)

	        pos_x = self._sin_cos(x_embed[:, :, :, None] / dim_t)
	        pos_y = self._sin_cos(y_embed[:, :, :, None] / dim_t)
	        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
	        self.cache[cache_key] = pos[0]
	        return pos

	    @torch.no_grad()
	    def forward(self, x: torch.Tensor):
	        """Return the embedding matching the batch size, device and last two dims of ``x``."""
	        batch = x.shape[0]
	        grid = (x.shape[-2], x.shape[-1])
	        return self._pe(batch, x.device, *grid)

	def get_activation_fn(activation):
	    """Return the functional activation named by ``activation``.

	    Supported names are ``"relu"``, ``"gelu"`` and ``"glu"``.

	    Raises:
	        RuntimeError: if ``activation`` is not one of the supported names.
	    """
	    if activation == "relu":
	        return F.relu
	    if activation == "gelu":
	        return F.gelu
	    if activation == "glu":
	        return F.glu
	    # The message lists every accepted name (it used to omit "glu").
	    raise RuntimeError(f"activation should be relu/gelu/glu, not {activation}.")

	def get_clones(module, N):
	    """Return an ``nn.ModuleList`` holding ``N`` independent deep copies of ``module``."""
	    copies = []
	    for _ in range(N):
	        copies.append(copy.deepcopy(module))
	    return nn.ModuleList(copies)

	class MemoryAttentionLayer(nn.Module):
	    """Pre-norm layer: self-attention, cross-attention to memory, then a feed-forward MLP."""

	    def __init__(
	        self,
	        activation: str,
	        cross_attention: nn.Module,
	        d_model: int,
	        dim_feedforward: int,
	        dropout: float,
	        pos_enc_at_attn: bool,
	        pos_enc_at_cross_attn_keys: bool,
	        pos_enc_at_cross_attn_queries: bool,
	        self_attention: nn.Module,
	    ):
	        """
	        Args:
	            activation: name resolved through ``get_activation_fn``.
	            cross_attention: module called as ``(q=, k=, v=)`` on the memory.
	            d_model: width of the token features.
	            dim_feedforward: hidden width of the feed-forward MLP.
	            dropout: dropout probability used for all dropout layers.
	            pos_enc_at_attn: add ``query_pos`` to self-attention q and k.
	            pos_enc_at_cross_attn_keys: add ``pos`` to the cross-attention keys.
	            pos_enc_at_cross_attn_queries: add ``query_pos`` to the
	                cross-attention queries.
	            self_attention: module called as ``(q, k, v=)`` on the targets.
	        """
	        super().__init__()
	        self.d_model = d_model
	        self.dim_feedforward = dim_feedforward
	        self.dropout_value = dropout
	        self.self_attn = self_attention
	        self.cross_attn_image = cross_attention

	        # Feed-forward model
	        self.linear1 = nn.Linear(d_model, dim_feedforward)
	        self.dropout = nn.Dropout(dropout)
	        self.linear2 = nn.Linear(dim_feedforward, d_model)

	        # one norm / dropout pair per sub-layer
	        self.norm1 = nn.LayerNorm(d_model)
	        self.norm2 = nn.LayerNorm(d_model)
	        self.norm3 = nn.LayerNorm(d_model)
	        self.dropout1 = nn.Dropout(dropout)
	        self.dropout2 = nn.Dropout(dropout)
	        self.dropout3 = nn.Dropout(dropout)

	        self.activation_str = activation
	        self.activation = get_activation_fn(activation)

	        # Where to add pos enc
	        self.pos_enc_at_attn = pos_enc_at_attn
	        self.pos_enc_at_cross_attn_queries = pos_enc_at_cross_attn_queries
	        self.pos_enc_at_cross_attn_keys = pos_enc_at_cross_attn_keys

	    def _forward_sa(self, tgt, query_pos):
	        """Self-attention sub-layer with residual connection."""
	        normed = self.norm1(tgt)
	        if self.pos_enc_at_attn:
	            queries = keys = normed + query_pos
	        else:
	            queries = keys = normed
	        attended = self.self_attn(queries, keys, v=normed)
	        return tgt + self.dropout1(attended)

	    def _forward_ca(self, tgt, memory, query_pos, pos, num_k_exclude_rope=0):
	        """Cross-attention sub-layer (targets attend to ``memory``) with residual connection."""
	        extra = {}
	        if num_k_exclude_rope > 0:
	            # only RoPE attention understands this keyword
	            assert isinstance(self.cross_attn_image, RoPEAttention)
	            extra = {"num_k_exclude_rope": num_k_exclude_rope}

	        normed = self.norm2(tgt)
	        queries = normed + query_pos if self.pos_enc_at_cross_attn_queries else normed
	        keys = memory + pos if self.pos_enc_at_cross_attn_keys else memory
	        attended = self.cross_attn_image(q=queries, k=keys, v=memory, **extra)
	        return tgt + self.dropout2(attended)

	    def forward(
	        self,
	        tgt,
	        memory,
	        pos: Optional[Tensor] = None,
	        query_pos: Optional[Tensor] = None,
	        num_k_exclude_rope: int = 0,
	    ) -> torch.Tensor:
	        """Apply self-attention, cross-attention and the MLP, each with a residual."""
	        # Self-Attn, Cross-Attn
	        tgt = self._forward_sa(tgt, query_pos)
	        tgt = self._forward_ca(tgt, memory, query_pos, pos, num_k_exclude_rope)
	        # MLP
	        hidden = self.linear1(self.norm3(tgt))
	        hidden = self.linear2(self.dropout(self.activation(hidden)))
	        return tgt + self.dropout3(hidden)


	class MemoryAttention(nn.Module):
	    """Stack of cloned attention layers followed by a final LayerNorm."""

	    def __init__(
	        self,
	        d_model: int,
	        pos_enc_at_input: bool,
	        layer: nn.Module,
	        num_layers: int,
	        batch_first: bool = True,  # Do layers expect batch first input?
	    ):
	        """
	        Args:
	            d_model: feature width normalized by the final LayerNorm.
	            pos_enc_at_input: add ``0.1 * curr_pos`` to the input tokens.
	            layer: prototype layer; it is deep-copied ``num_layers`` times.
	            num_layers: number of layers in the stack.
	            batch_first: whether the layers expect batch-first tensors.
	        """
	        super().__init__()
	        self.d_model = d_model
	        self.layers = get_clones(layer, num_layers)
	        self.num_layers = num_layers
	        self.norm = nn.LayerNorm(d_model)
	        self.pos_enc_at_input = pos_enc_at_input
	        self.batch_first = batch_first

	    def forward(
	        self,
	        curr: torch.Tensor,  # self-attention inputs
	        memory: torch.Tensor,  # cross-attention inputs
	        curr_pos: Optional[Tensor] = None,  # pos_enc for self-attention inputs
	        memory_pos: Optional[Tensor] = None,  # pos_enc for cross-attention inputs
	        num_obj_ptr_tokens: int = 0,  # number of object pointer tokens
	    ):
	        """Run every layer on ``curr`` attending to ``memory``.

	        ``curr`` and ``memory`` carry the batch on dim 1 (checked by the
	        assertion below). ``curr`` / ``curr_pos`` may also be given as
	        single-element lists. ``curr_pos`` and ``memory_pos`` may be ``None``,
	        in which case ``None`` is forwarded to the layers.

	        Returns:
	            The normalized output with the same dim order as ``curr``.
	        """
	        if isinstance(curr, list):
	            assert isinstance(curr_pos, list)
	            assert len(curr) == len(curr_pos) == 1
	            curr, curr_pos = (
	                curr[0],
	                curr_pos[0],
	            )

	        assert (
	            curr.shape[1] == memory.shape[1]
	        ), "Batch size must be the same for curr and memory"

	        output = curr
	        if self.pos_enc_at_input and curr_pos is not None:
	            output = output + 0.1 * curr_pos

	        if self.batch_first:
	            # Convert to batch first
	            output = output.transpose(0, 1).contiguous()
	            memory = memory.transpose(0, 1).contiguous()
	            # The positional encodings are optional in the signature, so
	            # only transpose the ones that were actually provided.
	            if curr_pos is not None:
	                curr_pos = curr_pos.transpose(0, 1).contiguous()
	            if memory_pos is not None:
	                memory_pos = memory_pos.transpose(0, 1).contiguous()

	        for layer in self.layers:
	            kwds = {}
	            if isinstance(layer.cross_attn_image, RoPEAttention):
	                kwds = {"num_k_exclude_rope": num_obj_ptr_tokens}

	            output = layer(
	                tgt=output,
	                memory=memory,
	                pos=memory_pos,
	                query_pos=curr_pos,
	                **kwds,
	            )
	        normed_output = self.norm(output)

	        if self.batch_first:
	            # Convert back to seq first. curr_pos is not used after this
	            # point, so it is deliberately left in batch-first order.
	            normed_output = normed_output.transpose(0, 1).contiguous()

	        return normed_output

	class MaskDownSampler(nn.Module):
	    """
	    Downsample a single-channel mask by ``total_stride`` in steps of ``stride``.

	    Every step is conv -> LayerNorm2d -> activation and multiplies the channel
	    count by ``stride**2``; a final 1x1 convolution projects to ``embed_dim``
	    channels. Note that LayerNorm is applied per token, like in ViT.
	    """

	    def __init__(
	        self,
	        embed_dim=256,
	        kernel_size=4,
	        stride=4,
	        padding=0,
	        total_stride=16,
	        activation=nn.GELU,
	    ):
	        """Build the encoder; ``total_stride`` must be an exact power of ``stride``."""
	        super().__init__()
	        num_layers = int(math.log2(total_stride) // math.log2(stride))
	        assert stride**num_layers == total_stride

	        growth = stride**2
	        in_chans, out_chans = 1, 1
	        stack = []
	        for _ in range(num_layers):
	            out_chans = in_chans * growth
	            stack.append(
	                nn.Conv2d(
	                    in_chans,
	                    out_chans,
	                    kernel_size=kernel_size,
	                    stride=stride,
	                    padding=padding,
	                )
	            )
	            stack.append(LayerNorm2d(out_chans))
	            stack.append(activation())
	            in_chans = out_chans

	        # final linear (1x1 conv) projection to the embedding width
	        stack.append(nn.Conv2d(out_chans, embed_dim, kernel_size=1))
	        self.encoder = nn.Sequential(*stack)

	    def forward(self, x):
	        """Return ``self.encoder`` applied to ``x``."""
	        return self.encoder(x)


	# Lightly adapted from ConvNext (https://github.com/facebookresearch/ConvNeXt)
	class CXBlock(nn.Module):
	    r"""ConvNeXt-style residual block.

	    Pipeline: conv -> LayerNorm2d in (N, C, H, W), then permute to
	    (N, H, W, C) for Linear -> GELU -> Linear (the pointwise convolutions),
	    optional layer scale, permute back and add the residual.

	    Args:
	        dim (int): Number of input channels.
	        kernel_size (int): Kernel size of the first convolution. Default: 7
	        padding (int): Padding of the first convolution. Default: 3
	        drop_path (float): Stochastic depth rate. Default: 0.0
	        layer_scale_init_value (float): Init value for Layer Scale; values
	            <= 0 disable layer scale. Default: 1e-6.
	        use_dwconv (bool): Use a depthwise convolution (``groups=dim``)
	            instead of a full one. Default: True
	    """

	    def __init__(
	        self,
	        dim,
	        kernel_size=7,
	        padding=3,
	        drop_path=0.0,
	        layer_scale_init_value=1e-6,
	        use_dwconv=True,
	    ):
	        super().__init__()
	        groups = dim if use_dwconv else 1
	        self.dwconv = nn.Conv2d(
	            dim,
	            dim,
	            kernel_size=kernel_size,
	            padding=padding,
	            groups=groups,
	        )  # depthwise conv
	        self.norm = LayerNorm2d(dim, eps=1e-6)
	        # pointwise/1x1 convs, implemented with linear layers
	        self.pwconv1 = nn.Linear(dim, 4 * dim)
	        self.act = nn.GELU()
	        self.pwconv2 = nn.Linear(4 * dim, dim)
	        if layer_scale_init_value > 0:
	            self.gamma = nn.Parameter(
	                layer_scale_init_value * torch.ones((dim)), requires_grad=True
	            )
	        else:
	            self.gamma = None
	        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

	    def forward(self, x):
	        """Apply the block to an (N, C, H, W) tensor and return the same layout."""
	        residual = x
	        out = self.norm(self.dwconv(x))
	        out = out.permute(0, 2, 3, 1).contiguous()  # (N, C, H, W) -> (N, H, W, C)
	        out = self.pwconv2(self.act(self.pwconv1(out)))
	        if self.gamma is not None:
	            out = self.gamma * out
	        out = out.permute(0, 3, 1, 2).contiguous()  # (N, H, W, C) -> (N, C, H, W)

	        return residual + self.drop_path(out)


	class Fuser(nn.Module):
	    """Optional 1x1 input projection followed by ``num_layers`` clones of ``layer``."""

	    def __init__(self, layer, num_layers, dim=None, input_projection=False):
	        """``dim`` is required only when ``input_projection`` is enabled."""
	        super().__init__()
	        self.proj = nn.Identity()
	        self.layers = get_clones(layer, num_layers)

	        if input_projection:
	            assert dim is not None
	            self.proj = nn.Conv2d(dim, dim, kernel_size=1)

	    def forward(self, x):
	        """Project ``x`` and feed it through every layer in order."""
	        # normally x: (N, C, H, W)
	        out = self.proj(x)
	        for block in self.layers:
	            out = block(out)
	        return out


	class MemoryEncoder(nn.Module):
	    """Fuse pixel features with a downsampled mask into memory features."""

	    def __init__(
	        self,
	        out_dim,
	        mask_downsampler,
	        fuser,
	        position_encoding,
	        in_dim=256,  # in_dim of pix_feats
	    ):
	        """
	        Args:
	            out_dim: channel count of the returned features; a 1x1 conv is
	                added only when it differs from ``in_dim``.
	            mask_downsampler: module applied to the (optionally sigmoided) masks.
	            fuser: module applied to the sum of projected features and masks.
	            position_encoding: callable producing the positional encoding of
	                the output features.
	            in_dim: channel count of ``pix_feat``.
	        """
	        super().__init__()

	        self.mask_downsampler = mask_downsampler

	        self.pix_feat_proj = nn.Conv2d(in_dim, in_dim, kernel_size=1)
	        self.fuser = fuser
	        self.position_encoding = position_encoding
	        self.out_proj = nn.Identity()
	        if out_dim != in_dim:
	            self.out_proj = nn.Conv2d(in_dim, out_dim, kernel_size=1)

	    def forward(
	        self,
	        pix_feat: torch.Tensor,
	        masks: torch.Tensor,
	        skip_mask_sigmoid: bool = False,
	    ) -> dict:
	        """Encode ``pix_feat`` together with ``masks``.

	        Args:
	            pix_feat: pixel features with ``in_dim`` channels.
	            masks: mask logits (or probabilities when ``skip_mask_sigmoid``).
	            skip_mask_sigmoid: skip the sigmoid applied to ``masks``.

	        Returns:
	            A dict with ``"vision_features"`` (the fused features) and
	            ``"vision_pos_enc"`` (a one-element list holding their positional
	            encoding, cast to the feature dtype).
	        """
	        ## Process masks
	        # sigmoid, so that less domain shift from gt masks which are bool
	        if not skip_mask_sigmoid:
	            masks = F.sigmoid(masks)
	        masks = self.mask_downsampler(masks)

	        ## Fuse pix_feats and downsampled masks
	        # the features may live on another device than the masks; move them over
	        pix_feat = pix_feat.to(masks.device)

	        x = self.pix_feat_proj(pix_feat)
	        x = x + masks
	        x = self.fuser(x)
	        x = self.out_proj(x)

	        pos = self.position_encoding(x).to(x.dtype)

	        return {"vision_features": x, "vision_pos_enc": [pos]}

	def load_checkpoint_with_prefix(filename, prefix=None, map_location='cpu', logger='current'):
	    """Load a checkpoint and optionally keep only the entries under a prefix.

	    The state dict is taken from the ``'state_dict'`` entry of the loaded
	    object, else from ``'model'``, else the loaded object itself is used.

	    Args:
	        filename: checkpoint location, passed unchanged to :func:`torch.load`.
	        prefix (str | None): name of the sub-module to extract. A trailing
	            ``'.'`` is added when missing. When empty or ``None`` the whole
	            state dict is returned.
	        map_location (str | None): same as :func:`torch.load`.
	            Defaults to ``'cpu'``.
	        logger: unused; kept for interface compatibility.

	    Returns:
	        dict or OrderedDict: the state dict; when ``prefix`` is given, a new
	        dict whose keys have the prefix stripped.

	    Raises:
	        AssertionError: if no key starts with ``prefix``.
	    """
	    # NOTE(security): torch.load unpickles the file; only load checkpoints
	    # from trusted sources (consider weights_only=True where supported).
	    checkpoint = torch.load(filename, map_location=map_location)

	    if 'state_dict' in checkpoint:
	        state_dict = checkpoint['state_dict']
	    elif 'model' in checkpoint:
	        state_dict = checkpoint['model']
	    else:
	        state_dict = checkpoint
	    if not prefix:
	        return state_dict
	    if not prefix.endswith('.'):
	        prefix += '.'
	    prefix_len = len(prefix)

	    state_dict = {
	        k[prefix_len:]: v
	        for k, v in state_dict.items() if k.startswith(prefix)
	    }

	    # Raised explicitly (instead of a bare assert) so the check also runs
	    # under ``python -O``; the exception type is unchanged for callers.
	    if not state_dict:
	        raise AssertionError(f'{prefix} is not in the pretrained model')
	    return state_dict

	def load_state_dict_to_model(model, state_dict, logger='current'):
	    """Load ``state_dict`` into ``model`` non-strictly and report mismatches.

	    Missing keys are printed and tolerated; unexpected keys are printed
	    and treated as an error.

	    Args:
	        model: module whose ``load_state_dict`` is called with ``strict=False``.
	        state_dict: mapping of parameter names to tensors.
	        logger: unused; kept for interface compatibility.

	    Raises:
	        RuntimeError: if ``state_dict`` has keys the model does not expect.
	    """
	    missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
	    if missing_keys:
	        print("========>>>MISSING_KEYS: ", missing_keys)
	    if unexpected_keys:
	        print("========>>>UNEXPECTED_KEYS: ", unexpected_keys)
	        # Name the offending keys so the failure can be diagnosed from the
	        # exception alone.
	        raise RuntimeError(
	            f"Unexpected keys in checkpoint state dict: {list(unexpected_keys)}"
	        )
	    print("Loaded checkpoint successfully")

	class SAM2Model(PreTrainedModel):
	config_class = SAM2Config
	base_model_prefix = "sam2"
	main_input_name = "pixel_values"
	supports_gradient_checkpointing = True
	_supports_sdpa = True

	def __init__(self, config):
	super().__init__(config)

	image_encoder = self.build_image_encoder()
	memory_attention = self.build_memory_attention()
	memory_encoder = self.build_memory_encoder()
	sam2_model = SAM2Base(
	image_encoder=image_encoder,
	memory_attention=memory_attention,
	memory_encoder=memory_encoder,
	num_maskmem = 7,
	image_size = 1024,
	# apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
	sigmoid_scale_for_mem_enc = 20.0,
	sigmoid_bias_for_mem_enc = -10.0,
	use_mask_input_as_output_without_sam = True,
	# Memory
	directly_add_no_mem_embed = True,
	no_obj_embed_spatial = True,
	# use high-resolution feature map in the SAM mask decoder
	use_high_res_features_in_sam = True,
	# output 3 masks on the first click on initial conditioning frames
	multimask_output_in_sam = True,
	# SAM heads
	iou_prediction_use_sigmoid = True,
	# cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
	use_obj_ptrs_in_encoder = True,
	add_tpos_enc_to_obj_ptrs = True,
	proj_tpos_enc_in_obj_ptrs = True,
	use_signed_tpos_enc_to_obj_ptrs = True,
	only_obj_ptrs_in_the_past_for_eval = True,
	# object occlusion prediction
	pred_obj_scores = True,
	pred_obj_scores_mlp = True,
	fixed_no_obj_ptr = True,
	# multimask tracking settings
	multimask_output_for_tracking = True,
	use_multimask_token_for_obj_ptr = True,
	multimask_min_pt_num = 0,
	multimask_max_pt_num = 1,
	use_mlp_for_obj_ptr_proj = True,
	# Compilation flag
	compile_image_encoder = False,
	# sam_mask_decoder_extra_args={
	# 'dynamic_multimask_via_stability':True,
	# 'dynamic_multimask_stability_delta': 0.05,
	# 'dynamic_multimask_stability_thresh': 0.98,
	# }
	)

	state_dict = load_checkpoint_with_prefix(config.ckpt_path)
	load_state_dict_to_model(sam2_model, state_dict)

	self.sam2_model = sam2_model

	self.hidden_dim = self.sam2_model.hidden_dim
	self.img_mean = (0.485, 0.456, 0.406)
	self.img_std = (0.229, 0.224, 0.225)

	def build_image_encoder(self):
	def build_trunk():
	embed_dim = 144
	num_heads = 2
	stages = [2, 6, 36, 4]
	global_att_blocks = [23, 33, 43]
	window_pos_embed_bkg_spatial_size = [7, 7]
	window_spec = [8, 4, 16, 8]
	ret = Hiera(
	embed_dim=embed_dim,
	num_heads=num_heads,
	stages=stages,
	global_att_blocks=global_att_blocks,
	window_pos_embed_bkg_spatial_size=window_pos_embed_bkg_spatial_size,
	window_spec=window_spec,
	)
	return ret
	def build_neck():
	def build_position_encoding():
	num_pos_feats = 256
	normalize = True
	scale = None
	temperature = 10000
	ret = PositionEmbeddingSine(
	num_pos_feats=num_pos_feats,
	normalize=normalize,
	scale=scale,
	temperature=temperature,
	)
	return ret
	d_model = 256
	backbone_channel_list = [1152, 576, 288, 144]
	fpn_top_down_levels = [2, 3] # output level 0 and 1 directly use the backbone features
	fpn_interp_model = 'nearest'
	position_encoding = build_position_encoding()
	ret = FpnNeck(
	d_model=d_model,
	position_encoding=position_encoding,
	backbone_channel_list=backbone_channel_list,
	fpn_top_down_levels=fpn_top_down_levels,
	fpn_interp_model=fpn_interp_model,
	)
	return ret
	scalp = 1
	trunk = build_trunk()
	neck = build_neck()
	ret = ImageEncoder(scalp=scalp, trunk=trunk, neck=neck)
	return ret

	def build_memory_attention(self):
	def build_layer():
	def build_self_attention():
	rope_theta = 10000.0
	feat_sizes = [64, 64]
	embedding_dim = 256
	num_heads = 1
	downsample_rate = 1
	dropout = 0.1
	ret = RoPEAttention(
	rope_theta=rope_theta,
	feat_sizes=feat_sizes,
	embedding_dim=embedding_dim,
	num_heads=num_heads,
	downsample_rate=downsample_rate,
	dropout=dropout
	)
	return ret
	def build_cross_attention():
	rope_theta = 10000.0
	feat_sizes = [64, 64]
	rope_k_repeat = True
	embedding_dim = 256
	num_heads = 1
	downsample_rate = 1
	dropout = 0.1
	kv_in_dim = 64
	ret = RoPEAttention(
	rope_theta=rope_theta,
	feat_sizes=feat_sizes,
	rope_k_repeat=rope_k_repeat,
	embedding_dim=embedding_dim,
	num_heads=num_heads,
	downsample_rate=downsample_rate,
	dropout=dropout,
	kv_in_dim=kv_in_dim
	)
	return ret
	activation = 'relu'
	dim_feedforward = 2048
	dropout = 0.1
	pos_enc_at_attn = False
	d_model = 256
	pos_enc_at_cross_attn_keys = True
	pos_enc_at_cross_attn_queries = False
	self_attention = build_self_attention()
	cross_attention = build_cross_attention()
	ret = MemoryAttentionLayer(
	activation=activation,
	dim_feedforward=dim_feedforward,
	dropout=dropout,
	pos_enc_at_attn=pos_enc_at_attn,
	d_model=d_model,
	pos_enc_at_cross_attn_queries=pos_enc_at_cross_attn_queries,
	pos_enc_at_cross_attn_keys=pos_enc_at_cross_attn_keys,
	self_attention=self_attention,
	cross_attention=cross_attention,
	)
	return ret
	d_model = 256
	pos_enc_at_input = True
	num_layers = 4
	layer = build_layer()
	ret = MemoryAttention(
	d_model=d_model,
	pos_enc_at_input=pos_enc_at_input,
	num_layers=num_layers,
	layer=layer,
	)
	return ret

	def build_memory_encoder(self):
	def build_position_encoding():
	num_pos_feats = 64
	normalize = True
	scale = None
	temperature = 10000
	ret = PositionEmbeddingSine(
	num_pos_feats=num_pos_feats,
	normalize=normalize,
	scale=scale,
	temperature=temperature,
	)
	return ret

	def build_mask_downsampler():
	kernel_size = 3
	stride = 2
	padding = 1
	ret = MaskDownSampler(
	kernel_size=kernel_size,
	stride=stride,
	padding=padding,
	)
	return ret

	def build_fuser():
	def build_layer():
	dim = 256
	kernel_size = 7
	padding = 3
	layer_scale_init_value = 1e-6
	use_dwconv = True # depth-wise convs
	ret = CXBlock(
	dim=dim, kernel_size=kernel_size,
	padding=padding, layer_scale_init_value=layer_scale_init_value,
	use_dwconv=use_dwconv,
	)
	return ret

	num_layers = 2
	layer = build_layer()
	ret = Fuser(
	layer=layer,
	num_layers=num_layers
	)
	return ret

	out_dim = 64
	position_encoding = build_position_encoding()
	mask_downsampler = build_mask_downsampler()
	fuser = build_fuser()
	ret = MemoryEncoder(
	out_dim=out_dim,
	position_encoding=position_encoding,
	mask_downsampler=mask_downsampler,
	fuser=fuser,
	)
	return ret


	def preprocess_image(self, image: torch.Tensor) -> torch.Tensor:
	image = image / 255.
	img_mean = torch.tensor(self.img_mean, dtype=image.dtype, device=image.device)[:, None, None]
	img_std = torch.tensor(self.img_std, dtype=image.dtype, device=image.device)[:, None, None]
	image -= img_mean
	image /= img_std
	return image

	def encode_mask_box_input(self, sam_states, mask_input, box_input_normalized, sam2_resolution=1024):
	if box_input_normalized is not None:
	box_input_normalized = box_input_normalized.reshape(-1, 2, 2)
	box_input_normalized = box_input_normalized * sam2_resolution
	box_labels = torch.tensor([[2,3]], dtype=torch.int, device=box_input_normalized.device)
	box_labels = box_labels.repeat(box_input_normalized.shape[0], 1)
	concat_points = (box_input_normalized, box_labels)
	else:
	concat_points = None

	sam_mask_prompt = [torch.nn.functional.interpolate(
	one_mask.unsqueeze(0).float(),
	size=self.sam2_model.sam_prompt_encoder.mask_input_size,
	align_corners=False,
	mode="bilinear",
	antialias=True).squeeze(0) for one_mask in mask_input]
	sam_mask_prompt = torch.cat(sam_mask_prompt, dim=0).unsqueeze(1)

	sparse_embeddings, dense_embeddings = self.sam2_model.sam_prompt_encoder(
	points=concat_points,
	boxes=None,
	masks=sam_mask_prompt,
	)

	B = sam_states['current_vision_feats'][-1].size(1) # batch size on this frame
	C = self.hidden_dim
	H, W = sam_states['feat_sizes'][-1]

	if self.sam2_model.directly_add_no_mem_embed:
	# directly add no-mem embedding (instead of using the transformer encoder)
	pix_feat_with_mem = sam_states['current_vision_feats'][-1] + self.sam2_model.no_mem_embed
	pix_feat_with_mem = pix_feat_with_mem.permute(1, 2, 0).view(B, C, H, W)
	else:
	raise NotImplementedError("directly add no memory embedding is not implemented")
	with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
	mask_tokens = self.sam2_model.sam_mask_encoder(
	image_embeddings=pix_feat_with_mem,
	image_pe=self.sam2_model.sam_prompt_encoder.get_dense_pe(),
	sparse_prompt_embeddings=sparse_embeddings,
	dense_prompt_embeddings=dense_embeddings,
	repeat_image=False,
	)

	return mask_tokens

	def inject_language_embd(self, sam_states, language_embed, nf_nobj=None):
	high_res_features = [
	x.permute(1, 2, 0).view(x.size(1), x.size(2), *s)
	for x, s in zip(sam_states['current_vision_feats'][:-1], sam_states['feat_sizes'][:-1])
	]

	B = sam_states['current_vision_feats'][-1].size(1) # batch size on this frame
	C = self.hidden_dim
	H, W = sam_states['feat_sizes'][-1]

	if self.sam2_model.directly_add_no_mem_embed:
	# directly add no-mem embedding (instead of using the transformer encoder)
	pix_feat_with_mem = sam_states['current_vision_feats'][-1] + self.sam2_model.no_mem_embed
	pix_feat_with_mem = pix_feat_with_mem.permute(1, 2, 0).view(B, C, H, W)
	else:
	raise NotImplementedError("directly add no memory embedding is not implemented")
	with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
	_, _, _, low_res_masks, high_res_masks, obj_ptr, _, = self.sam2_model._forward_sam_heads(
	backbone_features=pix_feat_with_mem,
	point_inputs=None,
	mask_inputs=None,
	high_res_features=high_res_features,
	multimask_output=self.sam2_model._use_multimask(is_init_cond_frame=True, point_inputs=None),
	# Inject language Embed if possible
	language_embed=language_embed,
	)

	if nf_nobj is not None:
	pred_masks = low_res_masks.squeeze(1)
	pred_masks = pred_masks.unflatten(0, nf_nobj)
	else:
	pred_masks = low_res_masks
	return pred_masks

	def get_sam2_embeddings(self, images, expand_size=1):
	# Step 1: inference the backbone with the images
	with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
	feats = self.sam2_model.forward_image(images)

	if expand_size > 1:
	# feats['vision_features'] = feats['vision_features'][:, None].expand(-1, expand_size, -1, -1, -1).flatten(0, 1)
	for i, feat in enumerate(feats["backbone_fpn"]):
	feats["backbone_fpn"][i] = feat[:, None].expand(-1, expand_size, -1, -1, -1).flatten(0, 1)
	for i, pos in enumerate(feats["vision_pos_enc"]):
	pos = pos[:, None].expand(-1, expand_size, -1, -1, -1).flatten(0, 1)
	feats["vision_pos_enc"][i] = pos

	# Step 2: Process the features to output
	_, current_vision_feats, current_vision_pos_embeds, feat_sizes = self.sam2_model._prepare_backbone_features(feats)

	return {
	"current_vision_feats": current_vision_feats,
	"current_vision_pos_embeds": current_vision_pos_embeds,
	"feat_sizes": feat_sizes,
	}

	def forward(self, pixel_values):
	    """Unsupported entry point; always raises ``NotImplementedError``."""
	    raise NotImplementedError()


	class VQEmebedding(nn.Embedding):
	    """Vector-quantization codebook with optional EMA updates.

	    The embedding table holds ``codebook_size`` code vectors plus one extra
	    row at index ``codebook_size`` that is registered as ``padding_idx``.
	    That extra row is excluded from nearest-neighbour search and from the
	    EMA statistics.

	    Args:
	        codebook_size: Number of code vectors.
	        embedding_dim: Dimensionality of each code vector.
	        ema: If true, parameters are frozen (``requires_grad=False``) and the
	            codebook is updated by exponential moving average during
	            training-mode forward passes.
	        decay: EMA decay factor.
	        restart_unused_codes: If true, codes whose EMA cluster size is below
	            1 are re-seeded from shuffled input vectors.
	        eps: Smoothing constant used when normalising cluster sizes.
	    """

	    def __init__(
	        self,
	        codebook_size: int,
	        embedding_dim: int,
	        ema: bool = True,
	        decay: float = 0.99,
	        restart_unused_codes: bool = True,
	        eps: float = 1e-5,
	    ):
	        super().__init__(
	            num_embeddings=codebook_size + 1,
	            embedding_dim=embedding_dim,
	            padding_idx=codebook_size,
	        )

	        self.ema = ema
	        self.decay = decay
	        self.eps = eps
	        self.restart_unused_codes = restart_unused_codes
	        self.codebook_size = codebook_size

	        if self.ema:
	            for param in self.parameters():
	                param.requires_grad_(False)

	            # The padding row is not tracked by the EMA statistics.
	            self.register_buffer('cluster_size_ema', torch.zeros(codebook_size))
	            self.register_buffer('embed_ema', self.weight[:-1, :].detach().clone())

	    @staticmethod
	    def _is_distributed():
	        """Return True when a default process group is available and initialised."""
	        return dist.is_available() and dist.is_initialized()

	    @torch.no_grad()
	    def compute_distances(self, inputs):
	        """Return squared Euclidean distances from ``inputs`` to every code.

	        Args:
	            inputs: Tensor whose last dimension equals the embedding dim.

	        Returns:
	            Tensor of shape ``(*inputs.shape[:-1], codebook_size)``.
	        """
	        codebook_t = self.weight[:-1, :].t().contiguous()

	        (embed_dim, _) = codebook_t.shape
	        inputs_shape = inputs.shape
	        assert inputs_shape[-1] == embed_dim

	        inputs_flat = inputs.reshape(-1, embed_dim).contiguous()

	        # ||x - e||^2 = ||x||^2 + ||e||^2 - 2 * x @ e
	        inputs_norm_sq = inputs_flat.pow(2.).sum(dim=1, keepdim=True)
	        codebook_t_norm_sq = codebook_t.pow(2.).sum(dim=0, keepdim=True)
	        distances = torch.addmm(
	            inputs_norm_sq + codebook_t_norm_sq,
	            inputs_flat,
	            codebook_t,
	            alpha=-2.0,
	        )
	        return distances.reshape(*inputs_shape[:-1], -1).contiguous()

	    @torch.no_grad()
	    def find_nearest_embedding(self, inputs):
	        """Return, for each input vector, the index of its closest code."""
	        return self.compute_distances(inputs).argmin(dim=-1)

	    @torch.no_grad()
	    def _tile_with_noise(self, x, target_n):
	        """Repeat the rows of ``x`` until there are at least ``target_n``.

	        Uniform noise in ``[0, 0.01 / sqrt(embed_dim))`` is added to every
	        element of the repeated tensor.
	        """
	        B, embed_dim = x.shape
	        n_repeats = (target_n + B - 1) // B
	        std = x.new_ones(embed_dim) * 0.01 / np.sqrt(embed_dim)
	        x = x.repeat(n_repeats, 1)
	        return x + torch.rand_like(x) * std

	    @torch.no_grad()
	    def _update_buffers(self, vectors, idxs):
	        """Fold one batch of assignments into the EMA statistics.

	        Args:
	            vectors: Input vectors; flattened to ``(n_vectors, embed_dim)``.
	            idxs: Code index assigned to each vector.

	        When a process group is initialised, the per-code counts and sums
	        are all-reduced and the restart candidates are broadcast from rank
	        0. Without one, the update uses local statistics only.
	        """
	        n_embed, embed_dim = self.weight.shape[0] - 1, self.weight.shape[-1]

	        vectors = vectors.reshape(-1, embed_dim).contiguous()
	        idxs = idxs.reshape(-1).contiguous()
	        n_vectors = vectors.shape[0]

	        # one_hot_idxs[k, j] == 1 iff vector j was assigned to code k.
	        one_hot_idxs = vectors.new_zeros(n_embed, n_vectors)
	        one_hot_idxs.scatter_(
	            dim=0,
	            index=idxs.unsqueeze(0),
	            src=vectors.new_ones(1, n_vectors),
	        )

	        cluster_size = one_hot_idxs.sum(dim=1)
	        vectors_sum_per_cluster = one_hot_idxs @ vectors

	        # No assert here: the guard below already covers the single-process
	        # case, which must not crash.
	        distributed = self._is_distributed()
	        if distributed:
	            dist.all_reduce(vectors_sum_per_cluster, op=dist.ReduceOp.SUM)
	            dist.all_reduce(cluster_size, op=dist.ReduceOp.SUM)

	        self.cluster_size_ema.mul_(self.decay).add_(cluster_size, alpha=1 - self.decay)
	        self.embed_ema.mul_(self.decay).add_(vectors_sum_per_cluster, alpha=1 - self.decay)

	        if self.restart_unused_codes:
	            if n_vectors < n_embed:
	                vectors = self._tile_with_noise(vectors, n_embed)
	            n_vectors = vectors.shape[0]
	            _vectors_random = vectors[torch.randperm(n_vectors, device=vectors.device)][:n_embed]

	            if distributed:
	                # Every rank must restart dead codes with the same vectors.
	                dist.broadcast(_vectors_random, 0)

	            # usage is 1 for codes with EMA cluster size >= 1, else 0.
	            usage = (self.cluster_size_ema.view(-1, 1) >= 1).float()
	            self.embed_ema.mul_(usage).add_(_vectors_random * (1 - usage))
	            self.cluster_size_ema.mul_(usage.view(-1))
	            self.cluster_size_ema.add_(torch.ones_like(self.cluster_size_ema) * (1 - usage).view(-1))

	    @torch.no_grad()
	    def _update_embedding(self):
	        """Overwrite the code vectors with the normalised EMA sums."""
	        n_embed = self.weight.shape[0] - 1
	        n = self.cluster_size_ema.sum()
	        normalized_cluster_size = (
	            n * (self.cluster_size_ema + self.eps) / (n + n_embed * self.eps)
	        )
	        self.weight[:-1, :] = self.embed_ema / normalized_cluster_size.reshape(-1, 1).contiguous()

	    def forward(self, inputs, freeze_codebook=False):
	        """Quantize ``inputs`` to their nearest code vectors.

	        Args:
	            inputs: Tensor whose last dimension equals the embedding dim.
	            freeze_codebook: If true, skip the EMA update even in training
	                mode.

	        Returns:
	            Tuple ``(embeds, embed_idxs)``: the selected code vectors and
	            their indices. In a training-mode EMA step the returned vectors
	            are looked up before the codebook weights are refreshed.
	        """
	        update_codebook = self.training and self.ema and not freeze_codebook

	        embed_idxs = self.find_nearest_embedding(inputs)
	        if update_codebook:
	            self._update_buffers(inputs, embed_idxs)

	        embeds = self.embed(embed_idxs)

	        if update_codebook:
	            self._update_embedding()

	        return embeds, embed_idxs

	    def embed(self, idxs):
	        """Look up code vectors for ``idxs`` (plain embedding lookup)."""
	        return super().forward(idxs)

	class ResidualQuantizer(nn.Module):
	    """Residual quantizer built from a stack of ``VQEmebedding`` codebooks.

	    At each depth the current residual is quantized by that depth's
	    codebook, the selected code vector is subtracted from the residual, and
	    the running sum of selected code vectors is recorded.

	    Args:
	        codebook_size: Codebook size. A single value is replicated for every
	            depth; an iterable supplies one size per depth.
	        latent_dim: Dimensionality of the vectors being quantized.
	        codebook_depth: Number of quantization stages.
	        decay: EMA decay, a single value or one per depth.
	        shared_codebook: If true, one codebook instance is reused at every
	            depth.
	        restart_unused_codes: Forwarded to every codebook.
	        commitment_loss: Stored on the instance; the methods of this class
	            do not consult it.

	    Raises:
	        ValueError: If ``shared_codebook`` is combined with an iterable
	            ``codebook_size`` or ``decay``.
	    """

	    def __init__(
	        self,
	        codebook_size: int,
	        latent_dim: int,
	        codebook_depth: int,
	        decay: float = 0.99,
	        shared_codebook: bool = False,
	        restart_unused_codes: bool = True,
	        commitment_loss: str = 'cumsum'
	    ):
	        super().__init__()

	        self.shared_codebook = shared_codebook
	        if self.shared_codebook:
	            if isinstance(codebook_size, Iterable) or isinstance(decay, Iterable):
	                raise ValueError("Shared codebooks are incompatible with list types of momentums or sizes: Change it into int")

	        self.restart_unused_codes = restart_unused_codes
	        self.codebook_size = codebook_size if isinstance(codebook_size, Iterable) else [codebook_size for _ in range(codebook_depth)]
	        self.decay = decay if isinstance(decay, Iterable) else [decay for _ in range(codebook_depth)]
	        self.codebook_depth = codebook_depth

	        if self.shared_codebook:
	            codebook0 = VQEmebedding(
	                codebook_size=self.codebook_size[0],
	                embedding_dim=latent_dim,
	                decay=self.decay[0],
	                restart_unused_codes=restart_unused_codes,
	            )
	            self.codebooks = nn.ModuleList([codebook0 for _ in range(codebook_depth)])
	        else:
	            self.codebooks = nn.ModuleList([
	                VQEmebedding(
	                    self.codebook_size[idx],
	                    latent_dim,
	                    decay=self.decay[idx],
	                    restart_unused_codes=restart_unused_codes,
	                )
	                for idx in range(codebook_depth)
	            ])

	        self.commitment_loss = commitment_loss

	    def quantize(self, x, freeze_codebook=False):
	        """Quantize ``x`` stage by stage.

	        Args:
	            x: 3-D tensor; the last dimension is the latent dimension.
	            freeze_codebook: Forwarded to every codebook.

	        Returns:
	            Tuple ``(quant_list, codes)``. ``quant_list[i]`` is the sum of
	            the code vectors selected at depths ``0..i``. ``codes`` stacks
	            the selected indices along a new last dimension.

	        Raises:
	            ValueError: If ``x`` is not 3-D.
	        """
	        if x.dim() != 3:
	            raise ValueError(f"expected a 3-D input, got shape {tuple(x.shape)}")

	        # The residual is tracked on a detached copy so in-place updates
	        # never touch the caller's tensor.
	        residual_feature = x.detach().clone()

	        quant_list = []
	        code_list = []
	        aggregated_quants = torch.zeros_like(x)
	        for i in range(self.codebook_depth):
	            quant, code = self.codebooks[i](residual_feature, freeze_codebook)

	            residual_feature.sub_(quant)
	            aggregated_quants.add_(quant)

	            quant_list.append(aggregated_quants.clone())
	            code_list.append(code.unsqueeze(-1))

	        codes = torch.cat(code_list, dim=-1)
	        return quant_list, codes

	    def compute_commitment_loss(self, x, quant_list):
	        r"""Return the mean, over depths, of the MSE between ``x`` and each
	        cumulative quantization in ``quant_list``.

	        The quantized tensors are detached, so gradients flow only to ``x``.
	        """
	        loss_list = [(x - quant.detach()).pow(2.0).mean() for quant in quant_list]
	        return torch.mean(torch.stack(loss_list))

	    @torch.no_grad()
	    def embed_code(self, code):
	        """Sum the code vectors addressed by ``code``, skipping ``-1`` slots.

	        Args:
	            code: Integer tensor whose last dimension indexes the codebook
	                depth. An entry of ``-1`` marks a slot that contributes
	                nothing. The tensor is not modified.

	        Returns:
	            Tensor with one summed embedding per entry along the first
	            dimension of ``code``.
	        """
	        # Compute the validity mask first and look up a copy of the indices.
	        # Writing zeros into `code` itself would erase the -1 markers (so
	        # every slot would count as valid) and corrupt the caller's tensor.
	        valid = code != -1
	        safe_code = code.masked_fill(~valid, 0)
	        code_slices = torch.chunk(safe_code, chunks=self.codebook_depth, dim=-1)

	        # With a shared codebook every entry of self.codebooks is the same
	        # module, so per-depth indexing covers both configurations.
	        embeds = torch.cat(
	            [self.codebooks[i].embed(code_slice) for i, code_slice in enumerate(code_slices)],
	            dim=-2,
	        )

	        sum_embeds = [row_embeds[row_valid].sum(0) for row_embeds, row_valid in zip(embeds, valid)]
	        return torch.stack(sum_embeds, dim=0)

	    def forward(self, x, freeze_codebook=False):
	        """Quantize ``x`` and return a straight-through estimate.

	        Returns:
	            Tuple ``(quants_trunc, commitment_loss, codes)``. The value of
	            ``quants_trunc`` is the final cumulative quantization, while its
	            gradient is routed to ``x``.
	        """
	        quant_list, codes = self.quantize(x, freeze_codebook)

	        commitment_loss = self.compute_commitment_loss(x, quant_list)
	        # Straight-through: forward value is the quantization, backward is identity.
	        quants_trunc = x + (quant_list[-1] - x).detach()

	        return quants_trunc, commitment_loss, codes


	@dataclass
	class VQ_SAM2ModelOutput(ModelOutput):
	    """Container for the values returned by ``VQ_SAM2.forward``.

	    Every field defaults to ``None``. Within this file ``VQ_SAM2.forward``
	    only ever sets ``pred_masks``, ``continues_mask_embeds``,
	    ``quant_mask_embeds`` and ``quant_codes``.
	    """

	    # Loss slots; nothing in this file populates them.
	    loss: Optional[torch.FloatTensor] = None
	    loss_recon: Optional[torch.FloatTensor] = None
	    loss_quant: Optional[torch.FloatTensor] = None
	    # Masks decoded from the quantized mask embeddings.
	    pred_masks: Optional[torch.FloatTensor] = None
	    # Mask embeddings before quantization.
	    continues_mask_embeds: Optional[torch.FloatTensor] = None
	    # Mask embeddings after quantization and projection back for the decoder.
	    quant_mask_embeds: Optional[torch.FloatTensor] = None
	    # Codebook indices selected by the residual quantizer.
	    quant_codes: Optional[torch.LongTensor] = None



	class VQ_SAM2(PreTrainedModel):
	    """Mask tokenizer that couples a SAM2 model with a residual quantizer.

	    A mask/box prompt is encoded into mask embeddings by the wrapped SAM2
	    model, the embeddings are quantized into discrete codes, and the
	    quantized embeddings are handed back to SAM2 to decode a mask.
	    """

	    base_model_prefix = ""
	    config_class = VQ_SAM2Config
	    _no_split_modules = ["MultiScaleBlock", "TwoWayAttentionBlock"]

	    def __init__(self, config):
	        """Build the SAM2 model, the optional projections and the quantizer.

	        The number of mask tokens is read from the environment variable
	        ``MASK_TOKENIZER_NUM_MASK_TOKEN`` (default 1). With more than one
	        token, two MLPs map between the concatenated token width and
	        ``config.latent_dim``; otherwise both projections are identities.
	        """
	        super().__init__(config)
	        self.model = SAM2Model._from_config(config.sam2_config)

	        sam_hidden_dim = 256
	        self.num_mask_tokens = int(os.environ.get("MASK_TOKENIZER_NUM_MASK_TOKEN", 1))
	        if self.num_mask_tokens <= 1:
	            self.concate_mask_embeds = nn.Identity()
	            self.deconcate_quant_embed = nn.Identity()
	        else:
	            packed_dim = sam_hidden_dim * self.num_mask_tokens
	            latent_dim = config.latent_dim
	            self.concate_mask_embeds = nn.Sequential(
	                nn.LayerNorm(packed_dim),
	                nn.Linear(packed_dim, latent_dim),
	                nn.GELU(),
	                nn.Linear(latent_dim, latent_dim),
	            )
	            self.deconcate_quant_embed = nn.Sequential(
	                nn.LayerNorm(latent_dim),
	                nn.Linear(latent_dim, packed_dim),
	                nn.GELU(),
	                nn.Linear(packed_dim, packed_dim),
	            )

	        self.quantizer = ResidualQuantizer(
	            codebook_size=config.codebook_size,
	            latent_dim=config.latent_dim,
	            codebook_depth=config.codebook_depth,
	            shared_codebook=config.shared_codebook,
	            restart_unused_codes=True,
	        )

	    def forward_with_codes(self, pixel_values, quant_codes):
	        """Decode masks from discrete codes.

	        Args:
	            pixel_values: Iterable of images; each is passed through
	                ``self.model.preprocess_image`` and the results are stacked.
	            quant_codes: Codes handed to ``self.quantizer.embed_code``; its
	                length is used as the batch size.

	        Returns:
	            The output of ``self.model.inject_language_embd``.
	        """
	        num_items = len(quant_codes)
	        frames = torch.stack([self.model.preprocess_image(image) for image in pixel_values])
	        states = self.model.get_sam2_embeddings(frames, expand_size=1)

	        prompt = self.quantizer.embed_code(quant_codes).unsqueeze(1)
	        prompt = self.deconcate_quant_embed(prompt)
	        prompt = prompt.reshape(num_items, self.num_mask_tokens, -1).contiguous()

	        return self.model.inject_language_embd(states, prompt, nf_nobj=(num_items, 1))

	    def forward_with_embeds(self, pixel_values, embeds):
	        """Decode masks from ready-made embeddings.

	        Args:
	            pixel_values: Iterable of images; each is passed through
	                ``self.model.preprocess_image`` and the results are stacked.
	            embeds: Tensor of embeddings; a singleton dimension is inserted
	                at position 1 before it is given to the decoder, and its
	                length is used as the batch size.

	        Returns:
	            The output of ``self.model.inject_language_embd``.
	        """
	        num_items = len(embeds)
	        frames = torch.stack([self.model.preprocess_image(image) for image in pixel_values])
	        states = self.model.get_sam2_embeddings(frames, expand_size=1)

	        return self.model.inject_language_embd(
	            states,
	            embeds.unsqueeze(1),
	            nf_nobj=(num_items, 1),
	        )

	    @can_return_tuple
	    def forward(
	        self,
	        pixel_values: Optional[torch.Tensor] = None,
	        gt_masks: Optional[list[torch.Tensor]] = None,
	        gt_boxes: Optional[torch.Tensor] = None,
	        reconstruct_mask = True,
	        freeze_codebook = False,
	    ) -> VQ_SAM2ModelOutput:
	        """Encode mask/box prompts into codes and optionally decode them.

	        Args:
	            pixel_values: Iterable of images; each is passed through
	                ``self.model.preprocess_image`` and the results are stacked.
	            gt_masks: Forwarded to ``self.model.encode_mask_box_input``.
	            gt_boxes: Forwarded to ``self.model.encode_mask_box_input``;
	                must not be ``None``.
	            reconstruct_mask: If false, return right after quantization with
	                only ``quant_codes`` set.
	            freeze_codebook: Forwarded to the quantizer.

	        Returns:
	            ``VQ_SAM2ModelOutput``. In training mode with ``gt_masks``
	            supplied, ``None`` is returned instead.
	        """
	        assert gt_boxes is not None, "Tokenizer works better given bbox prompt"

	        num_items = len(pixel_values)
	        frames = torch.stack([self.model.preprocess_image(image) for image in pixel_values])
	        states = self.model.get_sam2_embeddings(frames, expand_size=1)

	        # One flattened embedding per item, then projected to the latent width.
	        continuous = self.model.encode_mask_box_input(states, gt_masks, gt_boxes)
	        continuous = continuous.reshape(num_items, 1, -1).contiguous()
	        continuous = self.concate_mask_embeds(continuous)

	        # The commitment loss returned by the quantizer is not used below.
	        quantized, quant_loss, code = self.quantizer(continuous, freeze_codebook)
	        if not reconstruct_mask:
	            return VQ_SAM2ModelOutput(quant_codes=code)

	        quantized = self.deconcate_quant_embed(quantized)
	        quantized = quantized.reshape(num_items, self.num_mask_tokens, -1).contiguous()

	        pred_masks = self.model.inject_language_embd(states, quantized, nf_nobj=(num_items, 1))

	        if self.training and gt_masks is not None:
	            # NOTE(review): no loss is assembled on this path — confirm intent.
	            return None

	        return VQ_SAM2ModelOutput(
	            pred_masks=pred_masks,
	            continues_mask_embeds=continuous,
	            quant_mask_embeds=quantized,
	            quant_codes=code,
	        )