From 600a78f00dad624b7d211e5a9668bd960a3350d5 Mon Sep 17 00:00:00 2001 From: Kazuki Kyakuno Date: Fri, 20 Dec 2024 13:22:46 +0900 Subject: [PATCH 1/5] Added SAM2.1 --- .../sam2_video_predictor.py | 67 ++++++++++---- .../segment-anything-2/segment-anything-2.py | 90 +++++++++++++------ 2 files changed, 115 insertions(+), 42 deletions(-) diff --git a/image_segmentation/segment-anything-2/sam2_video_predictor.py b/image_segmentation/segment-anything-2/sam2_video_predictor.py index 537612f69..04973b5f8 100644 --- a/image_segmentation/segment-anything-2/sam2_video_predictor.py +++ b/image_segmentation/segment-anything-2/sam2_video_predictor.py @@ -130,6 +130,7 @@ def init_state( self, num_maskmem = 7, # default 1 input frame + 6 previous frames max_obj_ptrs_in_encoder = 16, + version = "2.1" ): """default state from yaml""" self.image_size = 1024 @@ -140,6 +141,9 @@ def init_state( self.training = False self.mem_dim = 64 self.add_tpos_enc_to_obj_ptrs = False + self.version = version + if version == "2.1": + self.add_tpos_enc_to_obj_ptrs = True self.use_obj_ptrs_in_encoder = True self.add_all_frames_to_correct_as_cond = False self.multimask_output_in_sam = True @@ -152,6 +156,8 @@ def init_state( self.use_obj_ptrs_in_encoder = True self.use_mlp_for_obj_ptr_proj = True self.proj_tpos_enc_in_obj_ptrs = False + if version == "2.1": + self.proj_tpos_enc_in_obj_ptrs = True self.soft_no_obj_ptr = False self.fixed_no_obj_ptr = True self.non_overlap_masks_for_mem_enc = False @@ -667,7 +673,7 @@ def _consolidate_temp_output_across_obj( return consolidated_out - def _get_empty_mask_ptr(self, inference_state, frame_idx, image_encoder, prompt_encoder, mask_decoder, memory_attention, memory_encoder, mlp): + def _get_empty_mask_ptr(self, inference_state, frame_idx, image_encoder, prompt_encoder, mask_decoder, memory_attention, memory_encoder, mlp, obj_ptr_tpos_proj): """Get a dummy object pointer based on an empty mask on the current frame.""" # A dummy (empty) mask with a single object batch_size = 1 @@ -703,11 +709,12 @@ def _get_empty_mask_ptr(self, inference_state, frame_idx, image_encoder, prompt_ mask_decoder=mask_decoder, memory_attention=memory_attention, memory_encoder=memory_encoder, - mlp=mlp + mlp=mlp, + obj_ptr_tpos_proj=obj_ptr_tpos_proj ) return current_out["obj_ptr"] - def propagate_in_video_preflight(self, inference_state, image_encoder, prompt_encoder, mask_decoder, memory_attention, memory_encoder, mlp): + def propagate_in_video_preflight(self, inference_state, image_encoder, prompt_encoder, mask_decoder, memory_attention, memory_encoder, mlp, obj_ptr_tpos_proj): assert(memory_encoder!=None) """Prepare inference_state and consolidate temporary outputs before tracking.""" # Tracking has started and we don't allow adding new objects until session is reset. 
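Note on the changes above: with `version == "2.1"`, `add_tpos_enc_to_obj_ptrs` and `proj_tpos_enc_in_obj_ptrs` are enabled and a new `obj_ptr_tpos_proj` network is threaded through the call chain. A minimal sketch of what that exported model is assumed to compute, a learned linear projection of the sinusoidal temporal encoding of each object pointer from C=256 down to mem_dim=64 (the weights below are placeholders, not the exported ones):

```python
import numpy as np

# Hedged sketch only: obj_ptr_tpos_proj is assumed to be a single learned
# linear layer mapping the C-dim sinusoidal temporal encoding to mem_dim.
C, mem_dim = 256, 64
rng = np.random.default_rng(0)
W = rng.standard_normal((C, mem_dim)).astype(np.float32)  # placeholder weights
b = np.zeros(mem_dim, dtype=np.float32)                    # placeholder bias

def obj_ptr_tpos_proj(obj_pos: np.ndarray) -> np.ndarray:
    # obj_pos: (N, C) temporal encodings -> (N, mem_dim) projected encodings
    return obj_pos @ W + b

print(obj_ptr_tpos_proj(np.zeros((3, C), dtype=np.float32)).shape)  # (3, 64)
```

This is why SAM 2.1 needs one extra ONNX file per model type, while SAM 2 simply uses zero (or identity) temporal encodings for the object pointers.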
@@ -788,6 +795,7 @@ def propagate_in_video( memory_attention=None, memory_encoder=None, mlp=None, + obj_ptr_tpos_proj=None, frame_idx = 0 ): """Propagate the input points across frames to track in the entire video.""" @@ -1000,7 +1008,8 @@ def _run_single_frame_inference( mask_decoder=None, memory_attention=None, memory_encoder=None, - mlp=None + mlp=None, + obj_ptr_tpos_proj=None ): """Run tracking on a single frame based on current inputs and previous memory.""" # Retrieve correct image features @@ -1031,7 +1040,8 @@ def _run_single_frame_inference( mask_decoder=mask_decoder, memory_attention=memory_attention, memory_encoder=memory_encoder, - mlp=mlp + mlp=mlp, + obj_ptr_tpos_proj=obj_ptr_tpos_proj ) # optionally offload the output to CPU memory to save GPU space @@ -1158,7 +1168,8 @@ def track_step( mask_decoder=None, memory_attention=None, memory_encoder=None, - mlp=None + mlp=None, + obj_ptr_tpos_proj=None ): current_out = {"point_inputs": point_inputs, "mask_inputs": mask_inputs} # High-resolution feature maps for the SAM head, reshape (HW)BC => BCHW @@ -1196,6 +1207,7 @@ def track_step( num_frames=num_frames, track_in_reverse=track_in_reverse, memory_attention=memory_attention, + obj_ptr_tpos_proj=obj_ptr_tpos_proj, ) # apply SAM-style segmentation head # here we might feed previously predicted low-res SAM mask logits into the SAM mask decoder, @@ -1565,7 +1577,8 @@ def _prepare_memory_conditioned_features( output_dict, num_frames, track_in_reverse=False, # tracking in reverse time order (for demo usage) - memory_attention=None + memory_attention=None, + obj_ptr_tpos_proj=None ): """Fuse the current frame's visual feature map with previous memory.""" B = current_vision_feats[-1].shape[1] # batch size on this frame @@ -1685,7 +1698,15 @@ def _prepare_memory_conditioned_features( tpos_dim = C if self.proj_tpos_enc_in_obj_ptrs else self.mem_dim obj_pos = pos_list obj_pos = get_1d_sine_pe(obj_pos / t_diff_max, dim=tpos_dim) - #obj_pos = self.obj_ptr_tpos_proj(obj_pos) # identity + + tpos = np.zeros((obj_pos.shape[0], 64)) + for i in range(obj_pos.shape[0]): + if self.onnx: + tpos[i:i+1,:] = obj_ptr_tpos_proj.run(None, {"obj_pos": obj_pos[i:i+1,:].astype(np.float32)})[0] + else: + tpos[i:i+1,:] = obj_ptr_tpos_proj.run({"obj_pos": obj_pos[i:i+1,:].astype(np.float32)})[0] + obj_pos = tpos + obj_pos = obj_pos.unsqueeze(1).expand(-1, B, self.mem_dim) else: obj_pos = np.zeros((len(pos_list), B, self.mem_dim), dtype=np.float32) @@ -1728,20 +1749,34 @@ def _prepare_memory_conditioned_features( if self.benchmark: start = int(round(time.time() * 1000)) - if self.normal: - if self.onnx: - pix_feat_with_mem = memory_attention.run(None, {"curr":current_vision_feats[0], "memory":memory, "curr_pos":current_vision_pos_embeds[0], "memory_pos":memory_pos_embed, "num_obj_ptr_tokens":num_obj_ptr_tokens_numpy}) - else: - pix_feat_with_mem = memory_attention.run({"curr":current_vision_feats[0], "memory":memory, "curr_pos":current_vision_pos_embeds[0], "memory_pos":memory_pos_embed, "num_obj_ptr_tokens":num_obj_ptr_tokens_numpy}) - else: + if self.version == "2.1": memory_1 = memory[:-num_obj_ptr_tokens,:,:] memory_2 = memory[-num_obj_ptr_tokens:,:,:] memory_pos_embed_1 = memory_pos_embed[:-num_obj_ptr_tokens,:,:] memory_pos_embed_2 = memory_pos_embed[-num_obj_ptr_tokens:,:,:] + attention_mask_1 = np.zeros((memory_1.shape[0], memory_1.shape[1]), dtype=np.bool_) + attention_mask_2 = np.zeros((memory_2.shape[0], memory_2.shape[1]), dtype=np.bool_) + attention_mask_1[:memory_1.shape[0],:] = True + 
attention_mask_2[:memory_2.shape[0],:] = True if self.onnx: - pix_feat_with_mem = memory_attention.run(None, {"curr":current_vision_feats[0], "memory_1":memory_1, "memory_2":memory_2, "curr_pos":current_vision_pos_embeds[0], "memory_pos_1":memory_pos_embed_1, "memory_pos_2":memory_pos_embed_2}) + pix_feat_with_mem = memory_attention.run(None, {"curr":current_vision_feats[0], "memory_1":memory_1, "memory_2":memory_2, "curr_pos":current_vision_pos_embeds[0], "memory_pos_1":memory_pos_embed_1, "memory_pos_2":memory_pos_embed_2, "attention_mask_1":attention_mask_1, "attention_mask_2":attention_mask_2}) else: - pix_feat_with_mem = memory_attention.run({"curr":current_vision_feats[0], "memory_1":memory_1, "memory_2":memory_2, "curr_pos":current_vision_pos_embeds[0], "memory_pos_1":memory_pos_embed_1, "memory_pos_2":memory_pos_embed_2}) + pix_feat_with_mem = memory_attention.run({"curr":current_vision_feats[0], "memory_1":memory_1, "memory_2":memory_2, "curr_pos":current_vision_pos_embeds[0], "memory_pos_1":memory_pos_embed_1, "memory_pos_2":memory_pos_embed_2, "attention_mask_1":attention_mask_1, "attention_mask_2":attention_mask_2}) + else: + if self.normal: + if self.onnx: + pix_feat_with_mem = memory_attention.run(None, {"curr":current_vision_feats[0], "memory":memory, "curr_pos":current_vision_pos_embeds[0], "memory_pos":memory_pos_embed, "num_obj_ptr_tokens":num_obj_ptr_tokens_numpy}) + else: + pix_feat_with_mem = memory_attention.run({"curr":current_vision_feats[0], "memory":memory, "curr_pos":current_vision_pos_embeds[0], "memory_pos":memory_pos_embed, "num_obj_ptr_tokens":num_obj_ptr_tokens_numpy}) + else: + memory_1 = memory[:-num_obj_ptr_tokens,:,:] + memory_2 = memory[-num_obj_ptr_tokens:,:,:] + memory_pos_embed_1 = memory_pos_embed[:-num_obj_ptr_tokens,:,:] + memory_pos_embed_2 = memory_pos_embed[-num_obj_ptr_tokens:,:,:] + if self.onnx: + pix_feat_with_mem = memory_attention.run(None, {"curr":current_vision_feats[0], "memory_1":memory_1, "memory_2":memory_2, "curr_pos":current_vision_pos_embeds[0], "memory_pos_1":memory_pos_embed_1, "memory_pos_2":memory_pos_embed_2}) + else: + pix_feat_with_mem = memory_attention.run({"curr":current_vision_feats[0], "memory_1":memory_1, "memory_2":memory_2, "curr_pos":current_vision_pos_embeds[0], "memory_pos_1":memory_pos_embed_1, "memory_pos_2":memory_pos_embed_2}) if self.benchmark: end = int(round(time.time() * 1000)) diff --git a/image_segmentation/segment-anything-2/segment-anything-2.py b/image_segmentation/segment-anything-2/segment-anything-2.py index 54a5b3fea..fa7d1f409 100644 --- a/image_segmentation/segment-anything-2/segment-anything-2.py +++ b/image_segmentation/segment-anything-2/segment-anything-2.py @@ -22,8 +22,6 @@ # Parameters # ====================== -REMOTE_PATH = 'https://storage.googleapis.com/ailia-models/segment-anything-2/' - IMAGE_PATH = 'truck.jpg' SAVE_IMAGE_PATH = 'output.png' @@ -60,7 +58,7 @@ help='Number of obj ptr in encoder.' ) parser.add_argument( - '-m', '--model_type', default='hiera_l', choices=('hiera_l', 'hiera_b+', 'hiera_s', 'hiera_t'), + '-m', '--model_type', default='hiera_t', choices=('hiera_l', 'hiera_b+', 'hiera_s', 'hiera_t'), help='Select model.' ) parser.add_argument( @@ -71,9 +69,24 @@ '--normal', action='store_true', help='Use normal version of onnx model. Normal version requires 6 dim matmul.' ) +parser.add_argument( + '--version', default='2.1', choices=('2', '2.1'), + help='Select model.' 
+) args = update_parser(parser) + +# ====================== +# Model path +# ====================== + +if args.version == "2.1": + REMOTE_PATH = 'https://storage.googleapis.com/ailia-models/segment-anything-2.1/' +else: + REMOTE_PATH = 'https://storage.googleapis.com/ailia-models/segment-anything-2/' + + # ====================== # Utility # ====================== @@ -238,7 +251,7 @@ def preprocess_frame(img, image_size): return img -def recognize_from_video(image_encoder, prompt_encoder, mask_decoder, memory_attention, memory_encoder, mlp): +def recognize_from_video(image_encoder, prompt_encoder, mask_decoder, memory_attention, memory_encoder, mlp, obj_ptr_tpos_proj): image_size = 1024 if args.video == "demo": @@ -266,7 +279,7 @@ def recognize_from_video(image_encoder, prompt_encoder, mask_decoder, memory_att predictor = SAM2VideoPredictor(args.onnx, args.normal, args.benchmark) - inference_state = predictor.init_state(args.num_mask_mem, args.max_obj_ptrs_in_encoder) + inference_state = predictor.init_state(args.num_mask_mem, args.max_obj_ptrs_in_encoder, args.version) predictor.reset_state(inference_state) frame_shown = False @@ -301,9 +314,9 @@ def recognize_from_video(image_encoder, prompt_encoder, mask_decoder, memory_att image_encoder) if frame_idx == 0: - annotate_frame(input_point, input_label, input_box, predictor, inference_state, image_encoder, prompt_encoder, mask_decoder, memory_attention, memory_encoder, mlp) + annotate_frame(input_point, input_label, input_box, predictor, inference_state, image_encoder, prompt_encoder, mask_decoder, memory_attention, memory_encoder, mlp, obj_ptr_tpos_proj) - frame = process_frame(frame, frame_idx, predictor, inference_state, image_encoder, prompt_encoder, mask_decoder, memory_attention, memory_encoder, mlp) + frame = process_frame(frame, frame_idx, predictor, inference_state, image_encoder, prompt_encoder, mask_decoder, memory_attention, memory_encoder, mlp, obj_ptr_tpos_proj) frame = frame.astype(np.uint8) if frame_idx == 0: @@ -328,7 +341,7 @@ def recognize_from_video(image_encoder, prompt_encoder, mask_decoder, memory_att if writer is not None: writer.release() -def annotate_frame(points, labels, box, predictor, inference_state, image_encoder, prompt_encoder, mask_decoder, memory_attention, memory_encoder, mlp): +def annotate_frame(points, labels, box, predictor, inference_state, image_encoder, prompt_encoder, mask_decoder, memory_attention, memory_encoder, mlp, obj_ptr_tpos_proj): ann_frame_idx = 0 # the frame index we interact with ann_obj_id = 1 # give a unique id to each object we interact with (it can be any integers) @@ -353,9 +366,10 @@ def annotate_frame(points, labels, box, predictor, inference_state, image_encode mask_decoder = mask_decoder, memory_attention = memory_attention, memory_encoder = memory_encoder, - mlp = mlp) + mlp = mlp, + obj_ptr_tpos_proj = obj_ptr_tpos_proj) -def process_frame(image, frame_idx, predictor, inference_state, image_encoder, prompt_encoder, mask_decoder, memory_attention, memory_encoder, mlp): +def process_frame(image, frame_idx, predictor, inference_state, image_encoder, prompt_encoder, mask_decoder, memory_attention, memory_encoder, mlp, obj_ptr_tpos_proj): out_frame_idx, out_obj_ids, out_mask_logits = predictor.propagate_in_video(inference_state, image_encoder = image_encoder, prompt_encoder = prompt_encoder, @@ -363,6 +377,7 @@ def process_frame(image, frame_idx, predictor, inference_state, image_encoder, p memory_attention = memory_attention, memory_encoder = memory_encoder, mlp = mlp, + 
obj_ptr_tpos_proj = obj_ptr_tpos_proj, frame_idx = frame_idx) image = show_mask((out_mask_logits[0] > 0.0), image, color = np.array([30, 144, 255]), obj_id = out_obj_ids[0]) @@ -372,24 +387,37 @@ def process_frame(image, frame_idx, predictor, inference_state, image_encoder, p def main(): # fetch image encoder model - WEIGHT_IMAGE_ENCODER_L_PATH = 'image_encoder_'+args.model_type+'.onnx' - MODEL_IMAGE_ENCODER_L_PATH = 'image_encoder_'+args.model_type+'.onnx.prototxt' - WEIGHT_PROMPT_ENCODER_L_PATH = 'prompt_encoder_'+args.model_type+'.onnx' - MODEL_PROMPT_ENCODER_L_PATH = 'prompt_encoder_'+args.model_type+'.onnx.prototxt' - WEIGHT_MASK_DECODER_L_PATH = 'mask_decoder_'+args.model_type+'.onnx' - MODEL_MASK_DECODER_L_PATH = 'mask_decoder_'+args.model_type+'.onnx.prototxt' - if args.normal: + model_type = args.model_type + if args.version == "2.1": + model_type = model_type + "_2.1" + WEIGHT_IMAGE_ENCODER_L_PATH = 'image_encoder_'+model_type+'.onnx' + MODEL_IMAGE_ENCODER_L_PATH = 'image_encoder_'+model_type+'.onnx.prototxt' + WEIGHT_PROMPT_ENCODER_L_PATH = 'prompt_encoder_'+model_type+'.onnx' + MODEL_PROMPT_ENCODER_L_PATH = 'prompt_encoder_'+model_type+'.onnx.prototxt' + WEIGHT_MASK_DECODER_L_PATH = 'mask_decoder_'+model_type+'.onnx' + MODEL_MASK_DECODER_L_PATH = 'mask_decoder_'+model_type+'.onnx.prototxt' + if args.normal or args.version == "2.1": # 6dim matmul - WEIGHT_MEMORY_ATTENTION_L_PATH = 'memory_attention_'+args.model_type+'.onnx' - MODEL_MEMORY_ATTENTION_L_PATH = 'memory_attention_'+args.model_type+'.onnx.prototxt' + WEIGHT_MEMORY_ATTENTION_L_PATH = 'memory_attention_'+model_type+'.onnx' + MODEL_MEMORY_ATTENTION_L_PATH = 'memory_attention_'+model_type+'.onnx.prototxt' else: # 4dim matmul with batch 1 - WEIGHT_MEMORY_ATTENTION_L_PATH = 'memory_attention_'+args.model_type+'.opt.onnx' - MODEL_MEMORY_ATTENTION_L_PATH = 'memory_attention_'+args.model_type+'.opt.onnx.prototxt' - WEIGHT_MEMORY_ENCODER_L_PATH = 'memory_encoder_'+args.model_type+'.onnx' - MODEL_MEMORY_ENCODER_L_PATH = 'memory_encoder_'+args.model_type+'.onnx.prototxt' - WEIGHT_MLP_L_PATH = 'mlp_'+args.model_type+'.onnx' - MODEL_MLP_L_PATH = 'mlp_'+args.model_type+'.onnx.prototxt' + WEIGHT_MEMORY_ATTENTION_L_PATH = 'memory_attention_'+model_type+'.opt.onnx' + MODEL_MEMORY_ATTENTION_L_PATH = 'memory_attention_'+model_type+'.opt.onnx.prototxt' + WEIGHT_MEMORY_ENCODER_L_PATH = 'memory_encoder_'+model_type+'.onnx' + MODEL_MEMORY_ENCODER_L_PATH = 'memory_encoder_'+model_type+'.onnx.prototxt' + WEIGHT_MLP_L_PATH = 'mlp_'+model_type+'.onnx' + MODEL_MLP_L_PATH = 'mlp_'+model_type+'.onnx.prototxt' + WEIGHT_TPOS_L_PATH = 'obj_ptr_tpos_proj_'+model_type+'.onnx' + MODEL_TPOS_L_PATH = 'obj_ptr_tpos_proj_'+model_type+'.onnx.prototxt' + if args.version == "2.1": + MODEL_IMAGE_ENCODER_L_PATH = None + MODEL_PROMPT_ENCODER_L_PATH = None + MODEL_MASK_DECODER_L_PATH = None + MODEL_MEMORY_ATTENTION_L_PATH = None + MODEL_MEMORY_ENCODER_L_PATH = None + MODEL_MLP_L_PATH = None + MODEL_TPOS_L_PATH = None # model files check and download check_and_download_models(WEIGHT_IMAGE_ENCODER_L_PATH, MODEL_IMAGE_ENCODER_L_PATH, REMOTE_PATH) @@ -398,6 +426,8 @@ def main(): check_and_download_models(WEIGHT_MEMORY_ATTENTION_L_PATH, MODEL_MEMORY_ATTENTION_L_PATH, REMOTE_PATH) check_and_download_models(WEIGHT_MEMORY_ENCODER_L_PATH, MODEL_MEMORY_ENCODER_L_PATH, REMOTE_PATH) check_and_download_models(WEIGHT_MLP_L_PATH, MODEL_MLP_L_PATH, REMOTE_PATH) + if args.version == "2.1": + check_and_download_models(WEIGHT_TPOS_L_PATH, MODEL_TPOS_L_PATH, REMOTE_PATH) if args.onnx: 
import onnxruntime @@ -407,6 +437,10 @@ def main(): memory_attention = onnxruntime.InferenceSession(WEIGHT_MEMORY_ATTENTION_L_PATH) memory_encoder = onnxruntime.InferenceSession(WEIGHT_MEMORY_ENCODER_L_PATH) mlp = onnxruntime.InferenceSession(WEIGHT_MLP_L_PATH) + if args.version == "2.1": + obj_ptr_tpos_proj = onnxruntime.InferenceSession(WEIGHT_TPOS_L_PATH) + else: + obj_ptr_tpos_proj = None else: import ailia memory_mode = ailia.get_memory_mode(reduce_constant=True, ignore_input_with_initializer=True, reduce_interstage=False, reuse_interstage=True) @@ -416,9 +450,13 @@ def main(): memory_attention = ailia.Net(weight=WEIGHT_MEMORY_ATTENTION_L_PATH, stream=MODEL_MEMORY_ATTENTION_L_PATH, memory_mode=memory_mode, env_id=args.env_id) memory_encoder = ailia.Net(weight=WEIGHT_MEMORY_ENCODER_L_PATH, stream=MODEL_MEMORY_ENCODER_L_PATH, memory_mode=memory_mode, env_id=args.env_id) mlp = ailia.Net(weight=WEIGHT_MLP_L_PATH, stream=MODEL_MLP_L_PATH, memory_mode=memory_mode, env_id=args.env_id) + if args.version == "2.1": + obj_ptr_tpos_proj = ailia.Net(weight=WEIGHT_TPOS_L_PATH, stream=MODEL_TPOS_L_PATH, memory_mode=memory_mode, env_id=args.env_id) + else: + obj_ptr_tpos_proj = None if args.video is not None: - recognize_from_video(image_encoder, prompt_encoder, mask_decoder, memory_attention, memory_encoder, mlp) + recognize_from_video(image_encoder, prompt_encoder, mask_decoder, memory_attention, memory_encoder, mlp, obj_ptr_tpos_proj) else: recognize_from_image(image_encoder, prompt_encoder, mask_decoder) From 1dcd89898974d95512ce48f88c80e490c674ae19 Mon Sep 17 00:00:00 2001 From: Kazuki Kyakuno Date: Sun, 29 Dec 2024 09:56:59 +0900 Subject: [PATCH 2/5] Added model list to readme --- .../segment-anything-2/README.md | 63 +++++++++++++++++-- .../segment-anything-2/segment-anything-2.py | 25 ++++---- 2 files changed, 71 insertions(+), 17 deletions(-) diff --git a/image_segmentation/segment-anything-2/README.md b/image_segmentation/segment-anything-2/README.md index b94e69f05..0caaee000 100644 --- a/image_segmentation/segment-anything-2/README.md +++ b/image_segmentation/segment-anything-2/README.md @@ -84,6 +84,11 @@ By adding the `--model_type` option, you can specify model type which is selecte $ python3 segment-anything-2.py --model_type hiera_l ``` +By adding the `--version` option, you can specify the model version, which is selected from "2" and "2.1". (default is "2") +```bash +$ python3 segment-anything-2.py --version "2.1" +``` + To improve the performance of MemoryAttention, you can also reduce the number of reference images in past frames, which is num_mask_mem.
```bash $ python3 segment-anything-2.py -v 0 --num_mask_mem 2 --max_obj_ptrs_in_encoder 2 @@ -103,7 +108,9 @@ ONNX opset=17 ## Netron -### hiera_l +### SAM2 + +#### hiera_l - [image_encoder_hiera_l.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2/image_encoder_hiera_l.onnx.prototxt) - [mask_decoder_hiera_l.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2/mask_decoder_hiera_l.onnx.prototxt) @@ -113,7 +120,7 @@ ONNX opset=17 - [memory_encoder_hiera_l.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2/memory_encoder_hiera_l.onnx.prototxt) - [mlp_hiera_l.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2/mlp_hiera_l.onnx.prototxt) -### hiera_b+ +#### hiera_b+ - [image_encoder_hiera_b+.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2/image_encoder_hiera_b+.onnx.prototxt) - [mask_decoder_hiera_b+.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2/mask_decoder_hiera_b+.onnx.prototxt) @@ -123,7 +130,7 @@ ONNX opset=17 - [memory_encoder_hiera_b+.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2/memory_encoder_hiera_b+.onnx.prototxt) - [mlp_hiera_b+.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2/mlp_hiera_b+.onnx.prototxt) -### hiera_s +#### hiera_s - [image_encoder_hiera_s.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2/image_encoder_hiera_s.onnx.prototxt) - [mask_decoder_hiera_s.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2/mask_decoder_hiera_s.onnx.prototxt) @@ -133,7 +140,7 @@ ONNX opset=17 - [memory_encoder_hiera_s.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2/memory_encoder_hiera_s.onnx.prototxt) - [mlp_hiera_s.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2/mlp_hiera_s.onnx.prototxt) -### hiera_t +#### hiera_t - [image_encoder_hiera_t.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2/image_encoder_hiera_t.onnx.prototxt) - [mask_decoder_hiera_t.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2/mask_decoder_hiera_t.onnx.prototxt) @@ -143,6 +150,52 @@ ONNX opset=17 - [memory_encoder_hiera_t.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2/memory_encoder_hiera_t.onnx.prototxt) - [mlp_hiera_t.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2/mlp_hiera_t.onnx.prototxt) -### Optimized model +#### Model information memory_attention.onnx uses a 6-dimensional Matmul. memory_attention.opt.onnx can be implemented using a 4-dimensional Matmul instead of fixing the batch size to 1. 
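To make the model information note above concrete: when the leading batch axes are fixed to 1, a 6-dimensional batched matmul collapses to an equivalent 4-dimensional one. A small NumPy illustration with made-up shapes (not the actual memory-attention graph):

```python
import numpy as np

# Illustrative shapes only: (batch, objects, heads, groups, seq, feat).
rng = np.random.default_rng(0)
a = rng.standard_normal((1, 1, 2, 8, 16, 32)).astype(np.float32)
b = rng.standard_normal((1, 1, 2, 8, 32, 24)).astype(np.float32)

# 6-dim matmul, as in memory_attention.onnx.
out_6d = np.matmul(a, b)                  # (1, 1, 2, 8, 16, 24)

# With the two leading batch axes fixed to 1 they can be folded away,
# so the same product becomes a 4-dim matmul, as in the .opt model.
out_4d = np.matmul(a[0, 0], b[0, 0])      # (2, 8, 16, 24)

print(np.allclose(out_6d[0, 0], out_4d))  # True
```

The optimized export trades generality (batch size must be 1) for compatibility with runtimes that only support 4-dimensional matmul.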
+ +### SAM2.1 + +#### hiera_l + +- [image_encoder_hiera_l_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/image_encoder_hiera_l_2.1.onnx.prototxt) +- [mask_decoder_hiera_l_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/mask_decoder_hiera_l_2.1.onnx.prototxt) +- [prompt_encoder_hiera_l_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/prompt_encoder_hiera_l_2.1.onnx.prototxt) +- [memory_attention_hiera_l_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/memory_attention_hiera_l_2.1.onnx.prototxt) +- [memory_encoder_hiera_l_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/memory_encoder_hiera_l_2.1.onnx.prototxt) +- [mlp_hiera_l_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/mlp_hiera_l_2.1.onnx.prototxt) +- [obj_ptr_tpos_proj_hiera_l_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/obj_ptr_tpos_proj_hiera_l_2.1.onnx.prototxt) + +#### hiera_b+ + +- [image_encoder_hiera_b+_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/image_encoder_hiera_b+_2.1.onnx.prototxt) +- [mask_decoder_hiera_b+_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/mask_decoder_hiera_b+_2.1.onnx.prototxt) +- [prompt_encoder_hiera_b+_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/prompt_encoder_hiera_b+_2.1.onnx.prototxt) +- [memory_attention_hiera_b+_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/memory_attention_hiera_b+_2.1.onnx.prototxt) +- [memory_encoder_hiera_b+_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/memory_encoder_hiera_b+_2.1.onnx.prototxt) +- [mlp_hiera_b+_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/mlp_hiera_b+_2.1.onnx.prototxt) +- [obj_ptr_tpos_proj_hiera_b+_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/obj_ptr_tpos_proj_hiera_b+_2.1.onnx.prototxt) + +#### hiera_s + +- [image_encoder_hiera_s_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/image_encoder_hiera_s_2.1.onnx.prototxt) +- [mask_decoder_hiera_s_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/mask_decoder_hiera_s_2.1.onnx.prototxt) +- [prompt_encoder_hiera_s_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/prompt_encoder_hiera_s_2.1.onnx.prototxt) +- [memory_attention_hiera_s_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/memory_attention_hiera_s_2.1.onnx.prototxt) +- [memory_encoder_hiera_s_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/memory_encoder_hiera_s_2.1.onnx.prototxt) +- 
[mlp_hiera_s_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/mlp_hiera_s_2.1.onnx.prototxt) +- [obj_ptr_tpos_proj_hiera_s_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/obj_ptr_tpos_proj_hiera_s_2.1.onnx.prototxt) + +#### hiera_t + +- [image_encoder_hiera_t_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/image_encoder_hiera_t_2.1.onnx.prototxt) +- [mask_decoder_hiera_t_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/mask_decoder_hiera_t_2.1.onnx.prototxt) +- [prompt_encoder_hiera_t_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/prompt_encoder_hiera_t_2.1.onnx.prototxt) +- [memory_attention_hiera_t_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/memory_attention_hiera_t_2.1.onnx.prototxt) +- [memory_encoder_hiera_t_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/memory_encoder_hiera_t_2.1.onnx.prototxt) +- [mlp_hiera_t_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/mlp_hiera_t_2.1.onnx.prototxt) +- [obj_ptr_tpos_proj_hiera_t_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/obj_ptr_tpos_proj_hiera_t_2.1.onnx.prototxt) + +#### Model information + +memory_attention.onnx can be implemented using a 4-dimensional Matmul instead of fixing the batch size to 1. diff --git a/image_segmentation/segment-anything-2/segment-anything-2.py b/image_segmentation/segment-anything-2/segment-anything-2.py index fa7d1f409..acb8bd18e 100644 --- a/image_segmentation/segment-anything-2/segment-anything-2.py +++ b/image_segmentation/segment-anything-2/segment-anything-2.py @@ -70,7 +70,7 @@ help='Use normal version of onnx model. Normal version requires 6 dim matmul.' ) parser.add_argument( - '--version', default='2.1', choices=('2', '2.1'), + '--version', default='2', choices=('2', '2.1'), help='Select model.' 
) @@ -396,27 +396,28 @@ def main(): MODEL_PROMPT_ENCODER_L_PATH = 'prompt_encoder_'+model_type+'.onnx.prototxt' WEIGHT_MASK_DECODER_L_PATH = 'mask_decoder_'+model_type+'.onnx' MODEL_MASK_DECODER_L_PATH = 'mask_decoder_'+model_type+'.onnx.prototxt' - if args.normal or args.version == "2.1": + if args.normal: # 6dim matmul + if args.version == "2.1": + raise Exception("The normal (6-dim matmul) model is not exported for SAM2.1.") WEIGHT_MEMORY_ATTENTION_L_PATH = 'memory_attention_'+model_type+'.onnx' MODEL_MEMORY_ATTENTION_L_PATH = 'memory_attention_'+model_type+'.onnx.prototxt' else: # 4dim matmul with batch 1 - WEIGHT_MEMORY_ATTENTION_L_PATH = 'memory_attention_'+model_type+'.opt.onnx' - MODEL_MEMORY_ATTENTION_L_PATH = 'memory_attention_'+model_type+'.opt.onnx.prototxt' + opt_id = ".opt" + if args.version == "2.1": + opt_id = "" + WEIGHT_MEMORY_ATTENTION_L_PATH = 'memory_attention_'+model_type+opt_id+'.onnx' + MODEL_MEMORY_ATTENTION_L_PATH = 'memory_attention_'+model_type+opt_id+'.onnx.prototxt' WEIGHT_MEMORY_ENCODER_L_PATH = 'memory_encoder_'+model_type+'.onnx' MODEL_MEMORY_ENCODER_L_PATH = 'memory_encoder_'+model_type+'.onnx.prototxt' WEIGHT_MLP_L_PATH = 'mlp_'+model_type+'.onnx' MODEL_MLP_L_PATH = 'mlp_'+model_type+'.onnx.prototxt' - WEIGHT_TPOS_L_PATH = 'obj_ptr_tpos_proj_'+model_type+'.onnx' - MODEL_TPOS_L_PATH = 'obj_ptr_tpos_proj_'+model_type+'.onnx.prototxt' if args.version == "2.1": - MODEL_IMAGE_ENCODER_L_PATH = None - MODEL_PROMPT_ENCODER_L_PATH = None - MODEL_MASK_DECODER_L_PATH = None - MODEL_MEMORY_ATTENTION_L_PATH = None - MODEL_MEMORY_ENCODER_L_PATH = None - MODEL_MLP_L_PATH = None + WEIGHT_TPOS_L_PATH = 'obj_ptr_tpos_proj_'+model_type+'.onnx' + MODEL_TPOS_L_PATH = 'obj_ptr_tpos_proj_'+model_type+'.onnx.prototxt' + else: + WEIGHT_TPOS_L_PATH = None MODEL_TPOS_L_PATH = None # model files check and download From ff84e58cee5746315ac0b5b94171cea95122e57f Mon Sep 17 00:00:00 2001 From: Kazuki Kyakuno Date: Sun, 29 Dec 2024 09:57:40 +0900 Subject: [PATCH 3/5] Change default model --- image_segmentation/segment-anything-2/segment-anything-2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/image_segmentation/segment-anything-2/segment-anything-2.py b/image_segmentation/segment-anything-2/segment-anything-2.py index acb8bd18e..52eb6e849 100644 --- a/image_segmentation/segment-anything-2/segment-anything-2.py +++ b/image_segmentation/segment-anything-2/segment-anything-2.py @@ -58,7 +58,7 @@ help='Number of obj ptr in encoder.' ) parser.add_argument( - '-m', '--model_type', default='hiera_t', choices=('hiera_l', 'hiera_b+', 'hiera_s', 'hiera_t'), + '-m', '--model_type', default='hiera_l', choices=('hiera_l', 'hiera_b+', 'hiera_s', 'hiera_t'), help='Select model.'
) parser.add_argument( From 6137a5febc5c5b0633e1cc1014b5d3ca1ed79ea6 Mon Sep 17 00:00:00 2001 From: Kazuki Kyakuno Date: Sun, 29 Dec 2024 10:03:09 +0900 Subject: [PATCH 4/5] Fix model id --- image_segmentation/segment-anything-2/README.md | 16 ++++++---------- .../segment-anything-2/segment-anything-2.py | 7 ++----- 2 files changed, 8 insertions(+), 15 deletions(-) diff --git a/image_segmentation/segment-anything-2/README.md b/image_segmentation/segment-anything-2/README.md index 0caaee000..adfda579c 100644 --- a/image_segmentation/segment-anything-2/README.md +++ b/image_segmentation/segment-anything-2/README.md @@ -150,10 +150,6 @@ ONNX opset=17 - [memory_encoder_hiera_t.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2/memory_encoder_hiera_t.onnx.prototxt) - [mlp_hiera_t.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2/mlp_hiera_t.onnx.prototxt) -#### Model information - -memory_attention.onnx uses a 6-dimensional Matmul. memory_attention.opt.onnx can be implemented using a 4-dimensional Matmul instead of fixing the batch size to 1. - ### SAM2.1 #### hiera_l @@ -161,7 +157,7 @@ memory_attention.onnx uses a 6-dimensional Matmul. memory_attention.opt.onnx can - [image_encoder_hiera_l_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/image_encoder_hiera_l_2.1.onnx.prototxt) - [mask_decoder_hiera_l_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/mask_decoder_hiera_l_2.1.onnx.prototxt) - [prompt_encoder_hiera_l_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/prompt_encoder_hiera_l_2.1.onnx.prototxt) -- [memory_attention_hiera_l_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/memory_attention_hiera_l_2.1.onnx.prototxt) +- [memory_attention_hiera_l_2.1.opt.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/memory_attention_hiera_l_2.1.opt.onnx.prototxt) - [memory_encoder_hiera_l_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/memory_encoder_hiera_l_2.1.onnx.prototxt) - [mlp_hiera_l_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/mlp_hiera_l_2.1.onnx.prototxt) - [obj_ptr_tpos_proj_hiera_l_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/obj_ptr_tpos_proj_hiera_l_2.1.onnx.prototxt) @@ -171,7 +167,7 @@ memory_attention.onnx uses a 6-dimensional Matmul. 
memory_attention.opt.onnx can - [image_encoder_hiera_b+_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/image_encoder_hiera_b+_2.1.onnx.prototxt) - [mask_decoder_hiera_b+_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/mask_decoder_hiera_b+_2.1.onnx.prototxt) - [prompt_encoder_hiera_b+_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/prompt_encoder_hiera_b+_2.1.onnx.prototxt) -- [memory_attention_hiera_b+_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/memory_attention_hiera_b+_2.1.onnx.prototxt) +- [memory_attention_hiera_b+_2.1.opt.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/memory_attention_hiera_b+_2.1.opt.onnx.prototxt) - [memory_encoder_hiera_b+_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/memory_encoder_hiera_b+_2.1.onnx.prototxt) - [mlp_hiera_b+_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/mlp_hiera_b+_2.1.onnx.prototxt) - [obj_ptr_tpos_proj_hiera_b+_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/obj_ptr_tpos_proj_hiera_b+_2.1.onnx.prototxt) @@ -181,7 +177,7 @@ memory_attention.onnx uses a 6-dimensional Matmul. memory_attention.opt.onnx can - [image_encoder_hiera_s_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/image_encoder_hiera_s_2.1.onnx.prototxt) - [mask_decoder_hiera_s_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/mask_decoder_hiera_s_2.1.onnx.prototxt) - [prompt_encoder_hiera_s_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/prompt_encoder_hiera_s_2.1.onnx.prototxt) -- [memory_attention_hiera_s_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/memory_attention_hiera_s_2.1.onnx.prototxt) +- [memory_attention_hiera_s_2.1.opt.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/memory_attention_hiera_s_2.1.opt.onnx.prototxt) - [memory_encoder_hiera_s_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/memory_encoder_hiera_s_2.1.onnx.prototxt) - [mlp_hiera_s_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/mlp_hiera_s_2.1.onnx.prototxt) - [obj_ptr_tpos_proj_hiera_s_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/obj_ptr_tpos_proj_hiera_s_2.1.onnx.prototxt) @@ -191,11 +187,11 @@ memory_attention.onnx uses a 6-dimensional Matmul. 
memory_attention.opt.onnx can - [image_encoder_hiera_t_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/image_encoder_hiera_t_2.1.onnx.prototxt) - [mask_decoder_hiera_t_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/mask_decoder_hiera_t_2.1.onnx.prototxt) - [prompt_encoder_hiera_t_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/prompt_encoder_hiera_t_2.1.onnx.prototxt) -- [memory_attention_hiera_t_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/memory_attention_hiera_t_2.1.onnx.prototxt) +- [memory_attention_hiera_t_2.1.opt.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/memory_attention_hiera_t_2.1.opt.onnx.prototxt) - [memory_encoder_hiera_t_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/memory_encoder_hiera_t_2.1.onnx.prototxt) - [mlp_hiera_t_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/mlp_hiera_t_2.1.onnx.prototxt) - [obj_ptr_tpos_proj_hiera_t_2.1.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/segment-anything-2.1/obj_ptr_tpos_proj_hiera_t_2.1.onnx.prototxt) -#### Model information +## Model information -memory_attention.onnx can be implemented using a 4-dimensional Matmul instead of fixing the batch size to 1. +memory_attention.onnx uses a 6-dimensional Matmul. memory_attention.opt.onnx can be implemented using a 4-dimensional Matmul instead of fixing the batch size to 1. diff --git a/image_segmentation/segment-anything-2/segment-anything-2.py b/image_segmentation/segment-anything-2/segment-anything-2.py index 52eb6e849..48f7cc163 100644 --- a/image_segmentation/segment-anything-2/segment-anything-2.py +++ b/image_segmentation/segment-anything-2/segment-anything-2.py @@ -404,11 +404,8 @@ def main(): MODEL_MEMORY_ATTENTION_L_PATH = 'memory_attention_'+model_type+'.onnx.prototxt' else: # 4dim matmul with batch 1 - opt_id = ".opt" - if args.version == "2.1": - opt_id = "" - WEIGHT_MEMORY_ATTENTION_L_PATH = 'memory_attention_'+model_type+opt_id+'.onnx' - MODEL_MEMORY_ATTENTION_L_PATH = 'memory_attention_'+model_type+opt_id+'.onnx.prototxt' + WEIGHT_MEMORY_ATTENTION_L_PATH = 'memory_attention_'+model_type+'.opt.onnx' + MODEL_MEMORY_ATTENTION_L_PATH = 'memory_attention_'+model_type+'.opt.onnx.prototxt' WEIGHT_MEMORY_ENCODER_L_PATH = 'memory_encoder_'+model_type+'.onnx' MODEL_MEMORY_ENCODER_L_PATH = 'memory_encoder_'+model_type+'.onnx.prototxt' WEIGHT_MLP_L_PATH = 'mlp_'+model_type+'.onnx' From 2459c8b15239454148ec8a43f371ad1375c164fb Mon Sep 17 00:00:00 2001 From: Kazuki Kyakuno Date: Sun, 29 Dec 2024 10:14:47 +0900 Subject: [PATCH 5/5] Fix video mode --- .../sam2_video_predictor.py | 36 +++++++++++-------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/image_segmentation/segment-anything-2/sam2_video_predictor.py b/image_segmentation/segment-anything-2/sam2_video_predictor.py index 04973b5f8..9e39a793d 100644 --- a/image_segmentation/segment-anything-2/sam2_video_predictor.py +++ b/image_segmentation/segment-anything-2/sam2_video_predictor.py @@ -65,8 +65,10 @@ def get_1d_sine_pe(pos_inds, dim, temperature=10000): dim_t = np.arange(pe_dim, dtype=np.float32) dim_t = temperature ** (2 * (dim_t // 2) / pe_dim) - pos_embed = 
pos_inds.unsqueeze(-1) / dim_t - pos_embed = np.concatenate([pos_embed.sin(), pos_embed.cos()], axis=-1) + #pos_embed = pos_inds.unsqueeze(-1) / dim_t + pos_embed = np.expand_dims(pos_inds, axis=-1) / dim_t + #pos_embed = np.concatenate([np.sin(pos_embed), np.cos(pos_embed)], axis=-1) + pos_embed = np.concatenate([np.sin(pos_embed), np.cos(pos_embed)], axis=-1) return pos_embed def concat_points(old_point_inputs, new_points, new_labels): @@ -296,7 +298,8 @@ def add_new_points_or_box( mask_decoder=None, memory_attention=None, memory_encoder=None, - mlp=None + mlp=None, + obj_ptr_tpos_proj=None ): """Add new points to a frame.""" obj_idx = self._obj_id_to_idx(inference_state, obj_id) @@ -408,7 +411,8 @@ def add_new_points_or_box( mask_decoder=mask_decoder, memory_attention=memory_attention, memory_encoder=memory_encoder, - mlp=mlp + mlp=mlp, + obj_ptr_tpos_proj=obj_ptr_tpos_proj ) # Add the output to the output dict (to be used as future memory) obj_temp_output_dict[storage_key][frame_idx] = current_out @@ -421,7 +425,7 @@ def add_new_points_or_box( is_cond=is_cond, run_mem_encoder=False, consolidate_at_video_res=True, - image_encoder=image_encoder, prompt_encoder=prompt_encoder, mask_decoder=mask_decoder, memory_encoder=memory_encoder, mlp=mlp + image_encoder=image_encoder, prompt_encoder=prompt_encoder, mask_decoder=mask_decoder, memory_encoder=memory_encoder, mlp=mlp, obj_ptr_tpos_proj=obj_ptr_tpos_proj ) _, video_res_masks = self._get_orig_video_res_output( inference_state, consolidated_out["pred_masks_video_res"] @@ -560,7 +564,8 @@ def _consolidate_temp_output_across_obj( mask_decoder=None, memory_attention=None, memory_encoder=None, - mlp=None + mlp=None, + obj_ptr_tpos_proj=None ): """ Consolidate the per-object temporary outputs in `temp_output_dict_per_obj` on @@ -625,7 +630,7 @@ def _consolidate_temp_output_across_obj( if run_mem_encoder: if empty_mask_ptr is None: empty_mask_ptr = self._get_empty_mask_ptr( - inference_state, frame_idx, image_encoder, prompt_encoder, mask_decoder, memory_attention, memory_encoder, mlp + inference_state, frame_idx, image_encoder, prompt_encoder, mask_decoder, memory_attention, memory_encoder, mlp, obj_ptr_tpos_proj ) # fill object pointer with a dummy pointer (based on an empty mask) consolidated_out["obj_ptr"][obj_idx : obj_idx + 1] = empty_mask_ptr @@ -744,7 +749,7 @@ def propagate_in_video_preflight(self, inference_state, image_encoder, prompt_en consolidated_out = self._consolidate_temp_output_across_obj( inference_state, frame_idx, is_cond=is_cond, run_mem_encoder=True, image_encoder=image_encoder, prompt_encoder=prompt_encoder, mask_decoder=mask_decoder, memory_attention=memory_attention, - memory_encoder=memory_encoder, mlp=mlp + memory_encoder=memory_encoder, mlp=mlp, obj_ptr_tpos_proj=obj_ptr_tpos_proj ) # merge them into "output_dict" and also create per-object slices output_dict[storage_key][frame_idx] = consolidated_out @@ -860,7 +865,8 @@ def propagate_in_video( mask_decoder=mask_decoder, memory_attention=memory_attention, memory_encoder=memory_encoder, - mlp=mlp + mlp=mlp, + obj_ptr_tpos_proj=obj_ptr_tpos_proj ) output_dict[storage_key][frame_idx] = current_out # Create slices of per-object outputs for subsequent interaction with each @@ -1696,18 +1702,20 @@ def _prepare_memory_conditioned_features( if self.add_tpos_enc_to_obj_ptrs: t_diff_max = max_obj_ptrs_in_encoder - 1 tpos_dim = C if self.proj_tpos_enc_in_obj_ptrs else self.mem_dim - obj_pos = pos_list + obj_pos = np.array(pos_list) obj_pos = get_1d_sine_pe(obj_pos / t_diff_max, 
dim=tpos_dim) - tpos = np.zeros((obj_pos.shape[0], 64)) + tpos = np.zeros((obj_pos.shape[0], 64), np.float32) for i in range(obj_pos.shape[0]): if self.onnx: - tpos[i:i+1,:] = obj_ptr_tpos_proj.run(None, {"obj_pos": obj_pos[i:i+1,:].astype(np.float32)})[0] + tpos[i:i+1,:] = obj_ptr_tpos_proj.run(None, {"x": obj_pos[i:i+1,:].astype(np.float32)})[0] else: - tpos[i:i+1,:] = obj_ptr_tpos_proj.run({"obj_pos": obj_pos[i:i+1,:].astype(np.float32)})[0] + tpos[i:i+1,:] = obj_ptr_tpos_proj.run({"x": obj_pos[i:i+1,:].astype(np.float32)})[0] obj_pos = tpos - obj_pos = obj_pos.unsqueeze(1).expand(-1, B, self.mem_dim) + #obj_pos = obj_pos.unsqueeze(1).expand(-1, B, self.mem_dim) + obj_pos_expanded = np.expand_dims(obj_pos, axis=1) # numpy + obj_pos = np.tile(obj_pos_expanded, (1, B, 1)) else: obj_pos = np.zeros((len(pos_list), B, self.mem_dim), dtype=np.float32) if self.mem_dim < C:
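The hunks in this last patch replace torch tensor idioms with NumPy equivalents (`unsqueeze(-1)` -> `np.expand_dims`, `unsqueeze(1).expand(-1, B, mem_dim)` -> `np.expand_dims` plus `np.tile`). A standalone sketch of the conversion, using illustrative sizes rather than the real ones:

```python
import numpy as np

def get_1d_sine_pe(pos_inds, dim, temperature=10000):
    # NumPy port of the sinusoidal temporal encoding, as in the patch above.
    pe_dim = dim // 2
    dim_t = np.arange(pe_dim, dtype=np.float32)
    dim_t = temperature ** (2 * (dim_t // 2) / pe_dim)
    pos_embed = np.expand_dims(pos_inds, axis=-1) / dim_t  # torch: pos_inds.unsqueeze(-1)
    return np.concatenate([np.sin(pos_embed), np.cos(pos_embed)], axis=-1)

B, mem_dim = 2, 64                       # illustrative batch size and memory dim
pos_list = np.array([1, 2, 3], dtype=np.float32)
obj_pos = get_1d_sine_pe(pos_list / 15.0, dim=mem_dim)   # (3, 64)

# torch: obj_pos.unsqueeze(1).expand(-1, B, mem_dim)
obj_pos = np.tile(np.expand_dims(obj_pos, axis=1), (1, B, 1))
print(obj_pos.shape)                     # (3, 2, 64)
```

Unlike torch's `expand`, `np.tile` materializes the repeated axis, which is acceptable here because B and the number of object pointers are small.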