Missing implementation error for CoreML #1299

thewh1teagle · 2024-08-10T21:29:11Z

Describe the bug
When running the optimized Whisper medium int8 model on macOS with the CoreML execution provider, I get the following error:

CreateSession(Msg("Could not find an implementation for DecoderMaskedMultiHeadAttention(1) node with name 'Attention_0'"))

To Reproduce
Enable the CoreML execution provider and load the medium.int8 GPU model from https://huggingface.co/thewh1teagle/whisper-olive/tree/main

Expected behavior
The model should load and run without errors.

Olive config

{
    "input_model": {
        "type": "CompositeModel",
        "model_component_names": [
            "encoder_decoder_init",
            "decoder"
        ],
        "model_components": [
            {
                "type": "PyTorchModel",
                "model_path": "openai/whisper-medium",
                "model_script": "code/user_script.py",
                "script_dir": "code",
                "model_loader": "get_encoder_decoder_init",
                "io_config": "get_encdec_io_config",
                "dummy_inputs_func": "encoder_decoder_init_dummy_inputs"
            },
            {
                "type": "PyTorchModel",
                "model_path": "openai/whisper-medium",
                "model_script": "code/user_script.py",
                "script_dir": "code",
                "model_loader": "get_decoder",
                "io_config": "get_dec_io_config",
                "dummy_inputs_func": "decoder_dummy_inputs"
            }
        ],
        "model_attributes": {
            "vocab_size": 51865,
            "num_mel_bins": 80,
            "d_model": 1024,
            "encoder_layers": 24,
            "encoder_attention_heads": 16,
            "decoder_layers": 24,
            "decoder_attention_heads": 16,
            "decoder_ffn_dim": 4096,
            "encoder_ffn_dim": 4096,
            "dropout": 0.0,
            "attention_dropout": 0.0,
            "activation_dropout": 0.0,
            "activation_function": "gelu",
            "init_std": 0.02,
            "encoder_layerdrop": 0.0,
            "decoder_layerdrop": 0.0,
            "use_cache": true,
            "num_hidden_layers": 24,
            "scale_embedding": false,
            "max_source_positions": 1500,
            "max_target_positions": 448,
            "classifier_proj_size": 256,
            "use_weighted_layer_sum": false,
            "apply_spec_augment": false,
            "mask_time_prob": 0.05,
            "mask_time_length": 10,
            "mask_time_min_masks": 2,
            "mask_feature_prob": 0.0,
            "mask_feature_length": 10,
            "mask_feature_min_masks": 0,
            "median_filter_width": 7,
            "return_dict": true,
            "output_hidden_states": false,
            "output_attentions": false,
            "torchscript": false,
            "torch_dtype": "float32",
            "use_bfloat16": false,
            "tf_legacy_loss": false,
            "pruned_heads": {},
            "tie_word_embeddings": true,
            "chunk_size_feed_forward": 0,
            "is_encoder_decoder": true,
            "is_decoder": false,
            "cross_attention_hidden_size": null,
            "add_cross_attention": false,
            "tie_encoder_decoder": false,
            "max_length": 448,
            "min_length": 0,
            "do_sample": false,
            "early_stopping": false,
            "num_beams": 1,
            "num_beam_groups": 1,
            "diversity_penalty": 0.0,
            "temperature": 1.0,
            "top_k": 50,
            "top_p": 1.0,
            "typical_p": 1.0,
            "repetition_penalty": 1.0,
            "length_penalty": 1.0,
            "no_repeat_ngram_size": 0,
            "encoder_no_repeat_ngram_size": 0,
            "bad_words_ids": null,
            "num_return_sequences": 1,
            "output_scores": false,
            "return_dict_in_generate": false,
            "forced_bos_token_id": null,
            "forced_eos_token_id": null,
            "remove_invalid_values": false,
            "exponential_decay_length_penalty": null,
            "begin_suppress_tokens": [
                220,
                50257
            ],
            "architectures": [
                "WhisperForConditionalGeneration"
            ],
            "finetuning_task": null,
            "id2label": {
                "0": "LABEL_0",
                "1": "LABEL_1"
            },
            "label2id": {
                "LABEL_0": 0,
                "LABEL_1": 1
            },
            "tokenizer_class": null,
            "prefix": null,
            "bos_token_id": 50257,
            "pad_token_id": 50257,
            "eos_token_id": 50257,
            "sep_token_id": null,
            "decoder_start_token_id": 50258,
            "task_specific_params": null,
            "problem_type": null,
            "_name_or_path": "openai/whisper-medium",
            "transformers_version": "4.42.4",
            "forced_decoder_ids": [
                [
                    1,
                    50259
                ],
                [
                    2,
                    50359
                ],
                [
                    3,
                    50363
                ]
            ],
            "model_type": "whisper"
        }
    },
    "systems": {
        "local_system": {
            "type": "LocalSystem",
            "accelerators": [
                {
                    "device": "gpu",
                    "execution_providers": [
                        "CUDAExecutionProvider"
                    ]
                }
            ]
        }
    },
    "data_configs": [
        {
            "name": "latency_data_config",
            "user_script": "code/user_script.py",
            "script_dir": "code",
            "load_dataset_config": {
                "type": "whisper_dataset",
                "data_dir": "data",
                "model_name": "openai/whisper-medium",
                "use_audio_decoder": true
            },
            "dataloader_config": {
                "type": "no_auto_batch_dataloader"
            }
        }
    ],
    "evaluators": {
        "common_evaluator": {
            "metrics": [
                {
                    "name": "latency",
                    "type": "latency",
                    "sub_types": [
                        {
                            "name": "avg",
                            "priority": 1
                        }
                    ],
                    "data_config": "latency_data_config"
                }
            ]
        }
    },
    "passes": {
        "conversion": {
            "type": "OnnxConversion",
            "target_opset": 17
        },
        "transformers_optimization": {
            "type": "OrtTransformersOptimization",
            "optimization_options": {
                "use_multi_head_attention": true
            },
            "use_gpu": true
        },
        "onnx_dynamic_quantization": {
            "type": "OnnxDynamicQuantization",
            "per_channel": false,
            "reduce_range": false,
            "op_types_to_quantize": [
                "MatMul",
                "Gemm",
                "Gather"
            ],
            "MatMulConstBOnly": false
        },
        "insert_beam_search": {
            "type": "InsertBeamSearch",
            "use_forced_decoder_ids": true,
            "use_logits_processor": false,
            "fp16": false
        },
        "prepost": {
            "type": "AppendPrePostProcessingOps",
            "tool_command": "whisper",
            "tool_command_args": {
                "model_name": "openai/whisper-medium",
                "use_audio_decoder": true
            },
            "target_opset": 17
        }
    },
    "log_severity_level": 0,
    "host": "local_system",
    "target": "local_system",
    "evaluator": "common_evaluator",
    "evaluate_input_model": false,
    "clean_cache": false,
    "cache_dir": "cache",
    "output_dir": "models",
    "output_name": "whisper_gpu_int8"
}

Olive logs
Add logs here.

Other information

OS: macOS
Olive version: 0.4.0
ONNXRuntime package and version: onnxruntime==1.18.1

Additional context
Related: #1213
The model was optimized on an Ubuntu server with CUDA.

jambayk · 2024-08-12T16:54:05Z

The Whisper example has only been optimized and tested for the CPU and CUDA execution providers (EPs). It uses multiple ONNX Runtime custom operators that have not been implemented on other EPs.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Missing implementation error for CoreML #1299

Missing implementation error for CoreML #1299

thewh1teagle commented Aug 10, 2024 •

edited

Loading

jambayk commented Aug 12, 2024

Missing implementation error for CoreML #1299

Missing implementation error for CoreML #1299

Comments

thewh1teagle commented Aug 10, 2024 • edited Loading

jambayk commented Aug 12, 2024

thewh1teagle commented Aug 10, 2024 •

edited

Loading