diff --git a/examples/keras/image_recognition/hf_bert/main.py b/examples/keras/image_recognition/hf_bert/main.py new file mode 100644 index 00000000000..6eabec7e867 --- /dev/null +++ b/examples/keras/image_recognition/hf_bert/main.py @@ -0,0 +1,945 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2020 The HuggingFace Team All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Fine-tuning the library models for question answering. +""" +# You can also adapt this script on your own question answering task. Pointers for this are left as comments. + +import json +import logging +import os +import sys +import time +import warnings +from dataclasses import dataclass, field +from pathlib import Path +from statistics import mean +from typing import Optional + +import evaluate +import tensorflow as tf +from datasets import load_dataset +from packaging.version import parse +from utils_qa import postprocess_qa_predictions + +import transformers +from transformers import ( + AutoConfig, + AutoTokenizer, + EvalPrediction, + HfArgumentParser, + PreTrainedTokenizerFast, + PushToHubCallback, + TFAutoModelForQuestionAnswering, + TFTrainingArguments, + create_optimizer, + set_seed, +) +from transformers.utils import CONFIG_NAME, TF2_WEIGHTS_NAME, check_min_version, send_example_telemetry + + +try: + import tf_keras as keras +except (ModuleNotFoundError, ImportError): + import keras + + if parse(keras.__version__).major > 2: + raise ValueError( + "Your currently installed version of Keras is Keras 3, but this is not yet supported in " + "Transformers. Please install the backwards-compatible tf-keras package with " + "`pip install tf-keras`." + ) + + +# Will error if the minimal version of Transformers is not installed. Remove at your own risks. +check_min_version("4.38.0.dev0") + +logger = logging.getLogger(__name__) + + +# region Arguments +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. + """ + + model_name_or_path: str = field( + metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} + ) + # TODO: Add support for profile + mode: Optional[str] = field( + default="benchmark", metadata={"help": "One of two options: benchmark/accuracy."} + ) + config_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} + ) + tokenizer_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} + ) + cache_dir: Optional[str] = field( + default=None, + metadata={"help": "Path to directory to store the pretrained models downloaded from huggingface.co"}, + ) + model_revision: str = field( + default="main", + metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, + ) + token: str = field( + default=None, + metadata={ + "help": ( + "The token to use as HTTP bearer authorization for remote files. 
If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." + ) + }, + ) + use_auth_token: bool = field( + default=None, + metadata={ + "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead." + }, + ) + trust_remote_code: bool = field( + default=False, + metadata={ + "help": ( + "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " + "should only be set to `True` for repositories you trust and in which you have read the code, as it will " + "execute code present on the Hub on your local machine." + ) + }, + ) + + +@dataclass +class DataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. + """ + + dataset_name: Optional[str] = field( + default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} + ) + dataset_config_name: Optional[str] = field( + default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} + ) + precision: Optional[str] = field( + default="fp32", metadata={"help": "The precision used to run the model. Can be fp32/bf16."} + ) + train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) + validation_file: Optional[str] = field( + default=None, + metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, + ) + test_file: Optional[str] = field( + default=None, + metadata={"help": "An optional input test data file to evaluate the perplexity on (a text file)."}, + ) + batch_size: Optional[int] = field( + default=128, + metadata={"help": "Specify the batch size. If this parameter is not specified, use the default batch size."}, + ) + # train_eval_warmup_steps is added to override 'warmup_steps' option in src/transformers/training_args.py + train_eval_warmup_steps: Optional[int] = field( + default=10, + metadata={"help": "Number of warmup steps for training and eval."}, + ) + steps: Optional[int] = field( + default=30, + metadata={"help": "Number of steps for training and eval."}, + ) + num_inter_threads: Optional[int] = field( + default=0, + metadata={"help": "Number of inter-op parallelism threads to use for training and eval."}, + ) + num_intra_threads: Optional[int] = field( + default=0, + metadata={"help": "Number of intra-op parallelism threads to use for training and eval."}, + ) + overwrite_cache: bool = field( + default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} + ) + preprocessing_num_workers: Optional[int] = field( + default=None, + metadata={"help": "The number of processes to use for the preprocessing."}, + ) + max_seq_length: int = field( + default=384, + metadata={ + "help": ( + "The maximum total input sequence length after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded." + ) + }, + ) + pad_to_max_length: bool = field( + default=True, + metadata={ + "help": ( + "Whether to pad all samples to `max_seq_length`. If False, will pad the samples dynamically when" + " batching to the maximum length in the batch (which can be faster on GPU but will be slower on TPU)." + ) + }, + ) + max_train_samples: Optional[int] = field( + default=None, + metadata={ + "help": ( + "For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set." 
+ ) + }, + ) + max_eval_samples: Optional[int] = field( + default=None, + metadata={ + "help": ( + "For debugging purposes or quicker training, truncate the number of evaluation examples to this " + "value if set." + ) + }, + ) + max_predict_samples: Optional[int] = field( + default=None, + metadata={ + "help": ( + "For debugging purposes or quicker training, truncate the number of prediction examples to this " + "value if set." + ) + }, + ) + version_2_with_negative: bool = field( + default=False, metadata={"help": "If true, some of the examples do not have an answer."} + ) + null_score_diff_threshold: float = field( + default=0.0, + metadata={ + "help": ( + "The threshold used to select the null answer: if the best answer has a score that is less than " + "the score of the null answer minus this threshold, the null answer is selected for this example. " + "Only useful when `version_2_with_negative=True`." + ) + }, + ) + doc_stride: int = field( + default=128, + metadata={"help": "When splitting up a long document into chunks, how much stride to take between chunks."}, + ) + n_best_size: int = field( + default=20, + metadata={"help": "The total number of n-best predictions to generate when looking for an answer."}, + ) + max_answer_length: int = field( + default=30, + metadata={ + "help": ( + "The maximum length of an answer that can be generated. This is needed because the start " + "and end predictions are not conditioned on one another." + ) + }, + ) + + def __post_init__(self): + if ( + self.dataset_name is None + and self.train_file is None + and self.validation_file is None + and self.test_file is None + ): + raise ValueError("Need either a dataset name or a training/validation file/test_file.") + else: + if self.train_file is not None: + extension = self.train_file.split(".")[-1] + assert extension in ["csv", "json"], "`train_file` should be a csv or a json file." + if self.validation_file is not None: + extension = self.validation_file.split(".")[-1] + assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file." + if self.test_file is not None: + extension = self.test_file.split(".")[-1] + assert extension in ["csv", "json"], "`test_file` should be a csv or a json file." + + +# endregion + + +# region Helper classes +class SavePretrainedCallback(keras.callbacks.Callback): + # Hugging Face models have a save_pretrained() method that saves both the weights and the necessary + # metadata to allow them to be loaded as a pretrained model in future. This is a simple Keras callback + # that saves the model with this method after each epoch. 
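+    # For illustration: a directory written by this callback can later be reloaded with the
+    # standard Hugging Face API, e.g.
+    #     TFAutoModelForQuestionAnswering.from_pretrained(output_dir + "_epoch_0")
+    # where the "_epoch_0" suffix matches the naming used in on_epoch_end() below.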
+ def __init__(self, output_dir, **kwargs): + super().__init__() + self.output_dir = output_dir + + def on_epoch_end(self, epoch, logs=None): + saved_model_dir = self.output_dir + '_epoch_' + str(epoch) + self.model.save_pretrained(saved_model_dir, saved_model=True) + + +# endregion + +class TimingCallback(keras.callbacks.Callback): + def __init__(self, batch_size, warmup_steps, steps): + self.total_time = 0 + self.batch_size = batch_size + self.num_processed_examples = 0 + self.warmup_steps = warmup_steps + self.steps = steps + + def on_predict_batch_begin(self, iteration, logs={}): + if iteration == self.warmup_steps: + # Start timer once warmup steps are done + self.start_time = time.time() + # Display start/stop info only if ONEDNN_VERBOSE is set + if os.getenv("ONEDNN_VERBOSE") and iteration >= self.warmup_steps: + logger.info('\n---> Start iteration {0}'.format(str(iteration - self.warmup_steps))) + + def on_predict_batch_end(self, iteration, logs={}): + self.num_processed_examples += self.batch_size + if os.getenv("ONEDNN_VERBOSE") and iteration >= self.warmup_steps: + logger.info('\n---> Stop iteration {0}'.format(str(iteration - self.warmup_steps))) + if iteration == self.steps - 1: + # Stop timer after the last step + self.total_time = time.time() - self.start_time + + +def main(): + # region Argument parsing + # See all possible arguments in src/transformers/training_args.py + # or by passing the --help flag to this script. + # We now keep distinct sets of args, for a cleaner separation of concerns. + + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments)) + if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): + # If we pass only one argument to the script and it's the path to a json file, + # let's parse it to get our arguments. + model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) + else: + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + tf.config.threading.set_inter_op_parallelism_threads(data_args.num_inter_threads) + tf.config.threading.set_intra_op_parallelism_threads(data_args.num_intra_threads) + + print("\n********** Using model_name_or_path from " + model_args.model_name_or_path + " **********\n") + + if data_args.precision == "bfloat16": + #keras.mixed_precision.set_global_policy('mixed_bfloat16') + tf.config.optimizer.set_experimental_options({'auto_mixed_precision_onednn_bfloat16': True}) + print(tf.config.optimizer.get_experimental_options()) + + if model_args.use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead.", + FutureWarning, + ) + if model_args.token is not None: + raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.") + model_args.token = model_args.use_auth_token + + # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The + # information sent is the one passed as arguments along with your Python/PyTorch versions. 
+ send_example_telemetry("run_qa", model_args, data_args, framework="tensorflow") + + output_dir = Path(training_args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + # endregion + + # region Checkpoints + checkpoint = None + if len(os.listdir(training_args.output_dir)) > 0 and not training_args.overwrite_output_dir: + if (output_dir / CONFIG_NAME).is_file() and (output_dir / TF2_WEIGHTS_NAME).is_file(): + checkpoint = output_dir + logger.info( + f"Checkpoint detected, resuming training from checkpoint in {training_args.output_dir}. To avoid this" + " behavior, change the `--output_dir` or add `--overwrite_output_dir` to train from scratch." + ) + #else: + # raise ValueError( + # f"Output directory ({training_args.output_dir}) already exists and is not empty. " + # "Use --overwrite_output_dir to continue regardless." + # ) + # endregion + + # region Logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + logger.setLevel(logging.INFO) + + # Set the verbosity to info of the Transformers logger (on main process only): + if training_args.should_log: + transformers.utils.logging.set_verbosity_info() + transformers.utils.logging.enable_default_handler() + transformers.utils.logging.enable_explicit_format() + #logger.info(f"Training/evaluation parameters {training_args}") + # endregion + + # Set seed before initializing model. + set_seed(training_args.seed) + + logger.info("Running " + model_args.mode + " for batch size " + str(data_args.batch_size)) + + training_args.do_eval = True + + # region Load Data + # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) + # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ + # (the dataset will be downloaded automatically from the datasets Hub). + # + # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called + # 'text' is found. You can easily tweak this behavior (see below). + # + # In distributed training, the load_dataset function guarantee that only one local process can concurrently + # download the dataset. + if data_args.dataset_name is not None: + # Downloading and loading a dataset from the hub. + datasets = load_dataset( + data_args.dataset_name, + data_args.dataset_config_name, + cache_dir=model_args.cache_dir, + token=model_args.token, + ) + else: + data_files = {} + if data_args.train_file is not None: + data_files["train"] = data_args.train_file + extension = data_args.train_file.split(".")[-1] + + if data_args.validation_file is not None: + data_files["validation"] = data_args.validation_file + extension = data_args.validation_file.split(".")[-1] + if data_args.test_file is not None: + data_files["test"] = data_args.test_file + extension = data_args.test_file.split(".")[-1] + datasets = load_dataset( + extension, + data_files=data_files, + field="data", + cache_dir=model_args.cache_dir, + token=model_args.token, + ) + # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at + # https://huggingface.co/docs/datasets/loading_datasets. + # endregion + + # region Load pretrained model and tokenizer + # + # Distributed training: + # The .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. 
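+    # Illustrative note: when --config_name or --tokenizer_name is not given, both calls below
+    # fall back to --model_name_or_path, so e.g. running with
+    #     --model_name_or_path distilbert-base-cased-distilled-squad
+    # (an example checkpoint, not a requirement of this script) loads the config and the fast
+    # tokenizer from that same Hub repository.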
+ config = AutoConfig.from_pretrained( + model_args.config_name if model_args.config_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, + ) + tokenizer = AutoTokenizer.from_pretrained( + model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_fast=True, + revision=model_args.model_revision, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, + ) + # endregion + + # region Tokenizer check: this script requires a fast tokenizer. + if not isinstance(tokenizer, PreTrainedTokenizerFast): + raise ValueError( + "This example script only works for models that have a fast tokenizer. Checkout the big table of models at" + " https://huggingface.co/transformers/index.html#supported-frameworks to find the model types that meet" + " this requirement" + ) + # endregion + + # region Preprocessing the datasets + # Preprocessing is slightly different for training and evaluation. + if training_args.do_train: + column_names = datasets["train"].column_names + elif training_args.do_eval: + column_names = datasets["validation"].column_names + else: + column_names = datasets["test"].column_names + question_column_name = "question" if "question" in column_names else column_names[0] + context_column_name = "context" if "context" in column_names else column_names[1] + answer_column_name = "answers" if "answers" in column_names else column_names[2] + + # Padding side determines if we do (question|context) or (context|question). + pad_on_right = tokenizer.padding_side == "right" + + if data_args.max_seq_length > tokenizer.model_max_length: + logger.warning( + f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the " + f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." + ) + max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) + + if data_args.pad_to_max_length or isinstance(training_args.strategy, tf.distribute.TPUStrategy): + logger.info("Padding all batches to max length because argument was set or we're on TPU.") + padding = "max_length" + else: + padding = False + + # Training preprocessing + def prepare_train_features(examples): + # Some of the questions have lots of whitespace on the left, which is not useful and will make the + # truncation of the context fail (the tokenized question will take a lots of space). So we remove that + # left whitespace + examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]] + + # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results + # in one example possible giving several features when a context is long, each of those features having a + # context that overlaps a bit the context of the previous feature. + tokenized_examples = tokenizer( + examples[question_column_name if pad_on_right else context_column_name], + examples[context_column_name if pad_on_right else question_column_name], + truncation="only_second" if pad_on_right else "only_first", + max_length=max_seq_length, + stride=data_args.doc_stride, + return_overflowing_tokens=True, + return_offsets_mapping=True, + padding=padding, + ) + + # Since one example might give us several features if it has a long context, we need a map from a feature to + # its corresponding example. 
This key gives us just that. + sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") + # The offset mappings will give us a map from token to character position in the original context. This will + # help us compute the start_positions and end_positions. + offset_mapping = tokenized_examples.pop("offset_mapping") + + # Let's label those examples! + tokenized_examples["start_positions"] = [] + tokenized_examples["end_positions"] = [] + + for i, offsets in enumerate(offset_mapping): + # We will label impossible answers with the index of the CLS token. + input_ids = tokenized_examples["input_ids"][i] + cls_index = input_ids.index(tokenizer.cls_token_id) + + # Grab the sequence corresponding to that example (to know what is the context and what is the question). + sequence_ids = tokenized_examples.sequence_ids(i) + + # One example can give several spans, this is the index of the example containing this span of text. + sample_index = sample_mapping[i] + answers = examples[answer_column_name][sample_index] + # If no answers are given, set the cls_index as answer. + if len(answers["answer_start"]) == 0: + tokenized_examples["start_positions"].append(cls_index) + tokenized_examples["end_positions"].append(cls_index) + else: + # Start/end character index of the answer in the text. + start_char = answers["answer_start"][0] + end_char = start_char + len(answers["text"][0]) + + # Start token index of the current span in the text. + token_start_index = 0 + while sequence_ids[token_start_index] != (1 if pad_on_right else 0): + token_start_index += 1 + + # End token index of the current span in the text. + token_end_index = len(input_ids) - 1 + while sequence_ids[token_end_index] != (1 if pad_on_right else 0): + token_end_index -= 1 + + # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index). + if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char): + tokenized_examples["start_positions"].append(cls_index) + tokenized_examples["end_positions"].append(cls_index) + else: + # Otherwise move the token_start_index and token_end_index to the two ends of the answer. + # Note: we could go after the last offset if the answer is the last word (edge case). 
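+                # Worked example with assumed offsets: if the answer spans characters 45-53 and
+                # context tokens 12, 13, 14 cover character ranges (44, 48), (48, 50), (50, 54),
+                # the first loop below stops once a token start exceeds 45 (at index 13) and
+                # records 13 - 1 = 12, while the second loop stops once a token end drops below
+                # 53 (at index 13) and records 13 + 1 = 14, so the label span [12, 14] fully
+                # covers the answer.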
+ while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char: + token_start_index += 1 + tokenized_examples["start_positions"].append(token_start_index - 1) + while offsets[token_end_index][1] >= end_char: + token_end_index -= 1 + tokenized_examples["end_positions"].append(token_end_index + 1) + + return tokenized_examples + + processed_datasets = {} + if training_args.do_train: + if "train" not in datasets: + raise ValueError("--do_train requires a train dataset") + train_dataset = datasets["train"] + if data_args.max_train_samples is not None: + # We will select sample from whole data if argument is specified + max_train_samples = min(len(train_dataset), data_args.max_train_samples) + train_dataset = train_dataset.select(range(max_train_samples)) + # Create train feature from dataset + train_dataset = train_dataset.map( + prepare_train_features, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + ) + if data_args.max_train_samples is not None: + # Number of samples might increase during Feature Creation, We select only specified max samples + max_train_samples = min(len(train_dataset), data_args.max_train_samples) + train_dataset = train_dataset.select(range(max_train_samples)) + processed_datasets["train"] = train_dataset + + # Validation preprocessing + def prepare_validation_features(examples): + # Some of the questions have lots of whitespace on the left, which is not useful and will make the + # truncation of the context fail (the tokenized question will take a lots of space). So we remove that + # left whitespace + examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]] + + # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results + # in one example possible giving several features when a context is long, each of those features having a + # context that overlaps a bit the context of the previous feature. + tokenized_examples = tokenizer( + examples[question_column_name if pad_on_right else context_column_name], + examples[context_column_name if pad_on_right else question_column_name], + truncation="only_second" if pad_on_right else "only_first", + max_length=max_seq_length, + stride=data_args.doc_stride, + return_overflowing_tokens=True, + return_offsets_mapping=True, + padding=padding, + ) + + # Since one example might give us several features if it has a long context, we need a map from a feature to + # its corresponding example. This key gives us just that. + sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") + + # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the + # corresponding example_id and we will store the offset mappings. + tokenized_examples["example_id"] = [] + + for i in range(len(tokenized_examples["input_ids"])): + # Grab the sequence corresponding to that example (to know what is the context and what is the question). + sequence_ids = tokenized_examples.sequence_ids(i) + context_index = 1 if pad_on_right else 0 + + # One example can give several spans, this is the index of the example containing this span of text. + sample_index = sample_mapping[i] + tokenized_examples["example_id"].append(examples["id"][sample_index]) + + # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token + # position is part of the context or not. 
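+            # For example (hypothetical values): with sequence_ids [None, 0, 0, None, 1, 1, None]
+            # and context_index == 1, only positions 4 and 5 keep their (start, end) character
+            # pair; question and special tokens are overwritten with None and can be skipped
+            # during post-processing.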
+ tokenized_examples["offset_mapping"][i] = [ + (o if sequence_ids[k] == context_index else None) + for k, o in enumerate(tokenized_examples["offset_mapping"][i]) + ] + + return tokenized_examples + + if training_args.do_eval: + if "validation" not in datasets: + raise ValueError("--do_eval requires a validation dataset") + eval_examples = datasets["validation"] + if data_args.max_eval_samples is not None: + # We will select sample from whole data + max_eval_samples = min(len(eval_examples), data_args.max_eval_samples) + eval_examples = eval_examples.select(range(max_eval_samples)) + # Validation Feature Creation + eval_dataset = eval_examples.map( + prepare_validation_features, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + ) + if data_args.max_eval_samples is not None: + # During Feature creation dataset samples might increase, we will select required samples again + max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) + eval_dataset = eval_dataset.select(range(max_eval_samples)) + processed_datasets["validation"] = eval_dataset + + if training_args.do_predict: + if "test" not in datasets: + raise ValueError("--do_predict requires a test dataset") + predict_examples = datasets["test"] + if data_args.max_predict_samples is not None: + # We will select sample from whole data + predict_examples = predict_examples.select(range(data_args.max_predict_samples)) + # Predict Feature Creation + predict_dataset = predict_examples.map( + prepare_validation_features, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + ) + if data_args.max_predict_samples is not None: + # During Feature creation dataset samples might increase, we will select required samples again + max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples) + predict_dataset = predict_dataset.select(range(max_predict_samples)) + processed_datasets["test"] = predict_dataset + # endregion + + # region Metrics and Post-processing: + def post_processing_function(examples, features, predictions, stage="eval"): + # Post-processing: we match the start logits and end logits to answers in the original context. + predictions = postprocess_qa_predictions( + examples=examples, + features=features, + predictions=predictions, + version_2_with_negative=data_args.version_2_with_negative, + n_best_size=data_args.n_best_size, + max_answer_length=data_args.max_answer_length, + null_score_diff_threshold=data_args.null_score_diff_threshold, + output_dir=training_args.output_dir, + prefix=stage, + ) + # Format the result to the format the metric expects. 
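+        # For reference (field values shown are placeholders): the squad/squad_v2 metrics expect
+        # prediction entries like
+        #     {"id": "<question id>", "prediction_text": "<predicted answer>"}
+        # (plus a "no_answer_probability" field for SQuAD v2) and reference entries like
+        #     {"id": "<question id>", "answers": {"text": [...], "answer_start": [...]}}
+        # which is what the branches below and the `references` list construct.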
+ if data_args.version_2_with_negative: + formatted_predictions = [ + {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items() + ] + else: + formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()] + + references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples] + return EvalPrediction(predictions=formatted_predictions, label_ids=references) + + metric = evaluate.load( + "squad_v2" if data_args.version_2_with_negative else "squad", cache_dir=model_args.cache_dir + ) + + def compute_metrics(p: EvalPrediction): + return metric.compute(predictions=p.predictions, references=p.label_ids) + + # endregion + + with training_args.strategy.scope(): + dataset_options = tf.data.Options() + dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF + num_replicas = training_args.strategy.num_replicas_in_sync + + # region Load model and prepare datasets + if checkpoint is None: + model_path = model_args.model_name_or_path + else: + model_path = checkpoint + model = TFAutoModelForQuestionAnswering.from_pretrained( + model_path, + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, + ) + if training_args.do_train: + training_dataset = model.prepare_tf_dataset( + processed_datasets["train"], + shuffle=True, + batch_size=training_args.per_device_train_batch_size * num_replicas, + tokenizer=tokenizer, + ) + + training_dataset = training_dataset.with_options(dataset_options) + + num_train_steps = len(training_dataset) * training_args.num_train_epochs + if training_args.warmup_steps > 0: + num_warmup_steps = training_args.warmup_steps + elif training_args.warmup_ratio > 0: + num_warmup_steps = int(num_train_steps * training_args.warmup_ratio) + else: + num_warmup_steps = 0 + + optimizer, schedule = create_optimizer( + init_lr=training_args.learning_rate, + num_train_steps=len(training_dataset) * training_args.num_train_epochs, + num_warmup_steps=num_warmup_steps, + adam_beta1=training_args.adam_beta1, + adam_beta2=training_args.adam_beta2, + adam_epsilon=training_args.adam_epsilon, + weight_decay_rate=training_args.weight_decay, + adam_global_clipnorm=training_args.max_grad_norm, + ) + + # Transformers models compute the right loss for their task by default when labels are passed, and will + # use this for training unless you specify your own loss function in compile(). + model.compile(optimizer=optimizer, jit_compile=training_args.xla, metrics=["accuracy"]) + + else: + # Convert trainable kernels to numpy arrays so that XLA can treat + # them as constants for inference optimization. 
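+            # Concretely, each layer's `kernel` tf.Variable is replaced by a plain NumPy array of
+            # the same shape, so XLA can fold the weights as compile-time constants; this is only
+            # safe here because this branch is inference-only (compile() below gets no optimizer).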
+ for submodule in model.submodules: + if hasattr(submodule, 'kernel'): + submodule.kernel = submodule.kernel.numpy() + model.compile(jit_compile=training_args.xla) + training_dataset = None + + if training_args.do_eval: + eval_dataset = model.prepare_tf_dataset( + processed_datasets["validation"], + shuffle=False, + batch_size=training_args.per_device_train_batch_size * num_replicas, + tokenizer=tokenizer, + ) + eval_dataset = eval_dataset.with_options(dataset_options) + else: + eval_dataset = None + + if training_args.do_predict: + predict_dataset = model.prepare_tf_dataset( + processed_datasets["test"], + shuffle=False, + batch_size=training_args.per_device_eval_batch_size * num_replicas, + tokenizer=tokenizer, + ) + predict_dataset = predict_dataset.with_options(dataset_options) + else: + predict_dataset = None + + # endregion + + # region Preparing push_to_hub and model card + push_to_hub_model_id = training_args.push_to_hub_model_id + model_name = model_args.model_name_or_path.split("/")[-1] + if not push_to_hub_model_id: + if data_args.dataset_name is not None: + push_to_hub_model_id = f"{model_name}-finetuned-{data_args.dataset_name}" + else: + push_to_hub_model_id = f"{model_name}-finetuned-question-answering" + + model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "question-answering"} + if data_args.dataset_name is not None: + model_card_kwargs["dataset_tags"] = data_args.dataset_name + if data_args.dataset_config_name is not None: + model_card_kwargs["dataset_args"] = data_args.dataset_config_name + model_card_kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" + else: + model_card_kwargs["dataset"] = data_args.dataset_name + + if training_args.push_to_hub: + callbacks = [ + PushToHubCallback( + output_dir=training_args.output_dir, + hub_model_id=push_to_hub_model_id, + hub_token=training_args.push_to_hub_token, + tokenizer=tokenizer, + **model_card_kwargs, + ) + ] + else: + callbacks = [] + # endregion + + # region Training and Evaluation + + if training_args.do_train: + # Note that the validation and test datasets have been processed in a different way to the + # training datasets in this example, and so they don't have the same label structure. + # As such, we don't pass them directly to Keras, but instead get model predictions to evaluate + # after training. + callbacks = SavePretrainedCallback(training_args.output_dir) + model.fit(training_dataset, epochs=int(training_args.num_train_epochs), callbacks=callbacks) + + if training_args.do_eval: + logger.info("*** Evaluation ***") + + # In this example, we compute advanced metrics at the end of training, but + # if you'd like to compute metrics every epoch that are too complex to be written as + # standard Keras metrics, you can use our KerasMetricCallback. 
See + # https://huggingface.co/docs/transformers/main/en/main_classes/keras_callbacks + if model_args.mode == "benchmark": + timing_callback = TimingCallback(data_args.batch_size, data_args.train_eval_warmup_steps, data_args.steps) + # Compute total_steps since model.predict() does not accept warmup_steps as an argument + total_steps = data_args.train_eval_warmup_steps + data_args.steps + + eval_predictions = model.predict(eval_dataset, batch_size=data_args.batch_size, + callbacks=[timing_callback], steps=total_steps) + + assert timing_callback.total_time > 0 + + # Compute throughput by excluding num_warmup_examples + num_warmup_examples = data_args.train_eval_warmup_steps * data_args.batch_size + num_benchmark_examples = timing_callback.num_processed_examples - num_warmup_examples + eval_throughput = num_benchmark_examples / timing_callback.total_time + + logger.info("Batch size: %d" % data_args.batch_size) + logger.info("Total examples: %d, Warmup examples: %d" % (timing_callback.num_processed_examples, num_warmup_examples)) + logger.info("Benchmark examples: %d, Benchmark time: %3.2f secs" % (num_benchmark_examples, timing_callback.total_time)) + logger.info("Throughput (examples/sec): %3.2f" % eval_throughput) + + if data_args.batch_size == 1: + eval_latency = (data_args.batch_size / eval_throughput) * 1000 + logger.info("Latency: %.2f ms" % (eval_latency)) + + elif model_args.mode == "accuracy": + eval_predictions = model.predict(eval_dataset, batch_size=data_args.batch_size) + logger.info("Computing evaluation metrics...") + if isinstance(eval_predictions.start_logits, tf.RaggedTensor): + # If predictions are RaggedTensor, we densify them. Since they are logits, padding with 0 is a bad idea! + # The reason is that a logit of 0 can often end up as quite a high probability value, sometimes even + # the highest probability in a sample. Instead, we use a large negative value, which ensures that the + # padding positions are correctly masked. + eval_start_logits = eval_predictions.start_logits.to_tensor(default_value=-1000).numpy() + eval_end_logits = eval_predictions.end_logits.to_tensor(default_value=-1000).numpy() + else: + eval_start_logits = eval_predictions.start_logits + eval_end_logits = eval_predictions.end_logits + + post_processed_eval = post_processing_function( + datasets["validation"], + processed_datasets["validation"], + (eval_start_logits, eval_end_logits), + ) + metrics = compute_metrics(post_processed_eval) + logger.info("Evaluation metrics:") + for metric, value in metrics.items(): + logger.info(f"{metric}: {value:.3f}") + if training_args.output_dir is not None: + output_eval_file = os.path.join(training_args.output_dir, "all_results.json") + with open(output_eval_file, "w") as writer: + writer.write(json.dumps(metrics)) + # endregion + + # region Prediction + if training_args.do_predict: + logger.info("*** Predict ***") + + test_predictions = model.predict(predict_dataset) + if isinstance(test_predictions.start_logits, tf.RaggedTensor): + # If predictions are RaggedTensor, we densify them. Since they are logits, padding with 0 is a bad idea! + # The reason is that a logit of 0 can often end up as quite a high probability value, sometimes even + # the highest probability in a sample. Instead, we use a large negative value, which ensures that the + # padding positions are correctly masked. 
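+            # A start/end logit of -1000 corresponds to a post-softmax probability of effectively
+            # zero, so padded positions can never be picked as answer-span boundaries.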
+ test_start_logits = test_predictions.start_logits.to_tensor(default_value=-1000).numpy() + test_end_logits = test_predictions.end_logits.to_tensor(default_value=-1000).numpy() + else: + test_start_logits = test_predictions.start_logits + test_end_logits = test_predictions.end_logits + post_processed_test = post_processing_function( + datasets["test"], + processed_datasets["test"], + (test_start_logits, test_end_logits), + ) + metrics = compute_metrics(post_processed_test) + + logging.info("Test metrics:") + for metric, value in metrics.items(): + logging.info(f"{metric}: {value:.3f}") + # endregion + + #if training_args.output_dir is not None and not training_args.push_to_hub: + # # If we're not pushing to hub, at least save a local copy when we're done + # model.save_pretrained(training_args.output_dir, saved_model=True) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/examples/keras/image_recognition/hf_resnet50/README.md b/examples/keras/image_recognition/hf_resnet50/README.md new file mode 100644 index 00000000000..54ab588faf4 --- /dev/null +++ b/examples/keras/image_recognition/hf_resnet50/README.md @@ -0,0 +1,76 @@ +Step-by-Step +============ + +This document is used to enable Tensorflow Keras models using IntelĀ® Neural Compressor. +This example can run on Intel CPUs and GPUs. + + +# Prerequisite + +## 1. Environment + +### Installation +```shell +# Install IntelĀ® Neural Compressor +pip install neural-compressor +``` + +### Install Requirements +The Tensorflow and intel-extension-for-tensorflow is mandatory to be installed to run this example. +The Intel Extension for Tensorflow for Intel CPUs is installed as default. +```shell +pip install -r requirements.txt +``` +> Note: Validated TensorFlow [Version](/docs/source/installation_guide.md#validated-software-environment). + +## 2. Prepare Pretrained model + +The pretrained model is provided by [Keras Applications](https://keras.io/api/applications/). prepare the model, Run as follow: + ``` +python prepare_model.py --output_model=/path/to/model + ``` +`--output_model ` the model should be saved as SavedModel format or H5 format. + +## 3. Prepare Dataset + + TensorFlow [models](https://github.com/tensorflow/models) repo provides [scripts and instructions](https://github.com/tensorflow/models/tree/master/research/slim#an-automated-script-for-processing-imagenet-data) to download, process and convert the ImageNet dataset to the TF records format. + We also prepared related scripts in `imagenet_prepare` directory. To download the raw images, the user must create an account with image-net.org. If you have downloaded the raw data and preprocessed the validation data by moving the images into the appropriate sub-directory based on the label (synset) of the image. we can use below command ro convert it to tf records format. + + ```shell + cd examples/keras/image_recognition/ + # convert validation subset + bash prepare_dataset.sh --output_dir=/resnetv2_50/quantization/ptq/data --raw_dir=/PATH/TO/img_raw/val/ --subset=validation + # convert train subset + bash prepare_dataset.sh --output_dir=/resnetv2_50/quantization/ptq/data --raw_dir=/PATH/TO/img_raw/train/ --subset=train + cd resnetv2_50/quantization/ptq + ``` +> **Note**: +> The raw ImageNet dataset resides in JPEG files should be in the following directory structure. Taking validation set as an example:
+>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;/PATH/TO/img_raw/val/n01440764/ILSVRC2012_val_00000293.JPEG
+>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;/PATH/TO/img_raw/val/n01440764/ILSVRC2012_val_00000543.JPEG
+> where 'n01440764' is the unique synset label associated with these images. + +# Run Command + +## Quantization Config +The Quantization Config class has default parameters setting for running on Intel CPUs. If running this example on Intel GPUs, the 'backend' parameter should be set to 'itex' and the 'device' parameter should be set to 'gpu'. + +``` +config = PostTrainingQuantConfig( + device="gpu", + backend="itex", + ... + ) +``` + +## Quantization + ```shell + bash run_quant.sh --input_model=./resnetv2_50_keras/ --output_model=./result --dataset_location=/path/to/evaluation/dataset + ``` + +## Benchmark + ```shell + bash run_benchmark.sh --input_model=./result --mode=accuracy --dataset_location=/path/to/evaluation/dataset --batch_size=32 + bash run_benchmark.sh --input_model=./result --mode=performance --dataset_location=/path/to/evaluation/dataset --batch_size=1 + ``` + diff --git a/examples/keras/image_recognition/hf_resnet50/main.py b/examples/keras/image_recognition/hf_resnet50/main.py new file mode 100644 index 00000000000..6c3aa9bd7a9 --- /dev/null +++ b/examples/keras/image_recognition/hf_resnet50/main.py @@ -0,0 +1,171 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2018 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import time +import numpy as np +import tensorflow as tf +from neural_compressor.utils import logger +# tf.config.optimizer.set_experimental_options({'remapping': False}) +tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) + +flags = tf.compat.v1.flags +FLAGS = flags.FLAGS + +## Required parameters +flags.DEFINE_string( + 'input_model', None, 'Run inference with specified keras model.') + +flags.DEFINE_string( + 'output_model', None, 'The output quantized model.') + +flags.DEFINE_string( + 'mode', 'performance', 'define benchmark mode for accuracy or performance') + +flags.DEFINE_bool( + 'tune', False, 'whether to tune the model') + +flags.DEFINE_bool( + 'benchmark', False, 'whether to benchmark the model') + +flags.DEFINE_string( + 'calib_data', None, 'location of calibration dataset') + +flags.DEFINE_string( + 'eval_data', None, 'location of evaluate dataset') + +flags.DEFINE_integer('batch_size', 32, 'batch_size') + +flags.DEFINE_integer( + 'iters', 100, 'maximum iteration when evaluating performance') + +from neural_compressor import Metric +from neural_compressor.data.transforms.transform import ComposeTransform +from neural_compressor.data.datasets.dataset import TensorflowImageRecord +from neural_compressor.data.transforms.imagenet_transform import LabelShift +from neural_compressor.data.dataloaders.tensorflow_dataloader import TensorflowDataLoader +from neural_compressor.data.transforms.imagenet_transform import BilinearImagenetTransform + +height = width = 224 +eval_dataset = TensorflowImageRecord(root=FLAGS.eval_data, transform=ComposeTransform(transform_list= \ + [BilinearImagenetTransform(height=height, width=width)])) + +eval_dataloader = TensorflowDataLoader(dataset=eval_dataset, batch_size=FLAGS.batch_size) + +if FLAGS.calib_data: + calib_dataset = TensorflowImageRecord(root=FLAGS.calib_data, transform= \ + ComposeTransform(transform_list= [BilinearImagenetTransform(height=height, width=width)])) + calib_dataloader = TensorflowDataLoader(dataset=calib_dataset, batch_size=10) + +def evaluate(model): + """ + Custom evaluate function to inference the model for specified metric on validation dataset. + + Args: + model (tf.keras.Model): The input model will be the objection of tf.keras.Model. + + Returns: + accuracy (float): evaluation result, the larger is better. 
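+
+    Example (illustrative sketch; assumes the SavedModel exported by prepare_model.py and that
+    the module-level flags such as --eval_data have already been parsed):
+
+        loaded = tf.saved_model.load("resnet50-saved-model/saved_model/1")
+        acc = evaluate(loaded)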
+ """ + infer = model.signatures["serving_default"] + # print ("infer.inputs: {}".format(infer.inputs)) + output_dict_keys = infer.structured_outputs.keys() + output_name = list(output_dict_keys )[0] + postprocess = LabelShift(label_shift=1) + from neural_compressor import METRICS + metrics = METRICS('tensorflow') + metric = metrics['topk']() + latency_list = [] + + def eval_func(dataloader, metric): + warmup = 5 + iteration = None + latency_list = [] + if FLAGS.benchmark and FLAGS.mode == 'performance': + iteration = FLAGS.iters + predict_fun = tf.function(infer, jit_compile=False) + for idx, (inputs, labels) in enumerate(dataloader): + inputs = np.array(inputs) + input_tensor = tf.constant(inputs, dtype=tf.float32) + input_tensor = tf.transpose(input_tensor, perm=[0, 3, 1, 2]) + start = time.time() + predictions = predict_fun(input_tensor)[output_name] + end = time.time() + predictions, labels = postprocess((predictions, labels)) + predictions = predictions.numpy() + metric.update(predictions, labels) + latency_list.append(end - start) + if iteration and idx >= iteration: + break + latency = np.array(latency_list[warmup:]).mean() / eval_dataloader.batch_size + return latency + + latency = eval_func(eval_dataloader, metric) + if FLAGS.benchmark: + logger.info("\n{} mode benchmark result:".format(FLAGS.mode)) + for i, res in enumerate(latency_list): + logger.debug("Iteration {} result {}:".format(i, res)) + if FLAGS.benchmark and FLAGS.mode == 'performance': + logger.info("Batch size = {}".format(eval_dataloader.batch_size)) + logger.info("Latency: {:.3f} ms".format(latency * 1000)) + logger.info("Throughput: {:.3f} images/sec".format(1. / latency)) + acc = metric.result() + return acc + +def main(_): + if FLAGS.tune: + from neural_compressor.quantization import fit + from neural_compressor.config import PostTrainingQuantConfig, AccuracyCriterion + from neural_compressor import set_random_seed + set_random_seed(9527) + excluded_op_type = { + 'matmul': { + 'weight':{ + 'dtype':['fp32'] + }, + 'activation':{ + 'dtype':['fp32'] + } + } + } + config = PostTrainingQuantConfig(backend='itex', + calibration_sampling_size=[50, 100], + accuracy_criterion = AccuracyCriterion(tolerable_loss=0.9999),) + #op_type_dict=excluded_op_type,) + q_model = fit( + model=FLAGS.input_model, + conf=config, + calib_func=evaluate, + eval_func=evaluate) + q_model.save(FLAGS.output_model) + + if FLAGS.benchmark: + from neural_compressor.benchmark import fit + from neural_compressor.config import BenchmarkConfig + if FLAGS.mode == 'performance': + conf = BenchmarkConfig(backend='itex', cores_per_instance=4, num_of_instance=1) + fit(FLAGS.input_model, conf, b_func=evaluate) + else: + # from neural_compressor.model import Model + # model = Model(FLAGS.input_model).model + from tensorflow.python.saved_model import load + model = load.load(FLAGS.input_model) + accuracy = evaluate(model) + logger.info('Batch size = %d' % FLAGS.batch_size) + logger.info("Accuracy: %.5f" % accuracy) + +if __name__ == "__main__": + tf.compat.v1.app.run() diff --git a/examples/keras/image_recognition/hf_resnet50/prepare_model.py b/examples/keras/image_recognition/hf_resnet50/prepare_model.py new file mode 100644 index 00000000000..0ebfa1c7eac --- /dev/null +++ b/examples/keras/image_recognition/hf_resnet50/prepare_model.py @@ -0,0 +1,7 @@ +import tensorflow as tf +from transformers import TFResNetForImageClassification + +# Download Resnet50 from HuggingFace and save it as saved model +# It will be saved at resnet50-saved-model/saved_model/1 
+model = TFResNetForImageClassification.from_pretrained("microsoft/resnet-50") +model.save_pretrained('resnet50-saved-model', saved_model=True) diff --git a/examples/keras/image_recognition/hf_resnet50/requirements.txt b/examples/keras/image_recognition/hf_resnet50/requirements.txt new file mode 100644 index 00000000000..8b7b47da969 --- /dev/null +++ b/examples/keras/image_recognition/hf_resnet50/requirements.txt @@ -0,0 +1,2 @@ +tensorflow>=2.11.1 +intel-extension-for-tensorflow[cpu] diff --git a/examples/keras/image_recognition/hf_resnet50/run_benchmark.sh b/examples/keras/image_recognition/hf_resnet50/run_benchmark.sh new file mode 100644 index 00000000000..d464b019f8e --- /dev/null +++ b/examples/keras/image_recognition/hf_resnet50/run_benchmark.sh @@ -0,0 +1,50 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_benchmark + +} + +# init params +function init_params { + batch_size=32 + iters=100 + + for var in "$@" + do + case $var in + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --mode=*) + mode=$(echo $var |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo $var |cut -f2 -d=) + ;; + --batch_size=*) + batch_size=$(echo $var |cut -f2 -d=) + ;; + --iters=*) + iters=$(echo $var |cut -f2 -d=) + esac + done + +} + +# run_tuning +function run_benchmark { + + python main.py \ + --input_model ${input_model} \ + --benchmark \ + --mode ${mode} \ + --eval_data ${dataset_location} \ + --batch_size ${batch_size} \ + --iters ${iters} +} + +main "$@" diff --git a/examples/keras/image_recognition/hf_resnet50/run_quant.sh b/examples/keras/image_recognition/hf_resnet50/run_quant.sh new file mode 100644 index 00000000000..7e3ed727f71 --- /dev/null +++ b/examples/keras/image_recognition/hf_resnet50/run_quant.sh @@ -0,0 +1,40 @@ +#!/bin/bash +set -x + +function main { + init_params "$@" + run_tuning + +} + +# init params +function init_params { + + for var in "$@" + do + case $var in + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --output_model=*) + output_model=$(echo $var |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo $var |cut -f2 -d=) + ;; + esac + done + +} + +# run_tuning +function run_tuning { + python main.py \ + --input_model ${input_model} \ + --output_model ${output_model} \ + --eval_data ${dataset_location} \ + --calib_data ${dataset_location} \ + --tune +} + +main "$@" diff --git a/examples/keras/image_recognition/imagenet_prepare/build_imagenet_data.py b/examples/keras/image_recognition/imagenet_prepare/build_imagenet_data.py index c52d2bd4218..fea38a9fdfe 100644 --- a/examples/keras/image_recognition/imagenet_prepare/build_imagenet_data.py +++ b/examples/keras/image_recognition/imagenet_prepare/build_imagenet_data.py @@ -377,7 +377,7 @@ def _process_image_files(name, filenames, synsets, labels, humans, num_shards): assert len(filenames) == len(humans) # Break all images into batches with a [ranges[i][0], ranges[i][1]]. 
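+  # NOTE: np.int (an alias for Python's built-in int) was removed in NumPy 1.24, so the explicit
+  # np.int32 dtype is used below instead.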
- spacing = np.linspace(0, len(filenames), FLAGS.num_threads + 1).astype(np.int) + spacing = np.linspace(0, len(filenames), FLAGS.num_threads + 1).astype(np.int32) ranges = [] threads = [] for i in xrange(len(spacing) - 1): diff --git a/examples/keras/image_recognition/inception_v3/quantization/ptq/main.py b/examples/keras/image_recognition/inception_v3/quantization/ptq/main.py index 65c3a23ce9b..596088a52e2 100644 --- a/examples/keras/image_recognition/inception_v3/quantization/ptq/main.py +++ b/examples/keras/image_recognition/inception_v3/quantization/ptq/main.py @@ -118,10 +118,13 @@ def main(_): if FLAGS.tune: from neural_compressor.quantization import fit from neural_compressor.config import PostTrainingQuantConfig + from neural_compressor.config import AccuracyCriterion from neural_compressor import set_random_seed set_random_seed(9527) + accuracy_criterion = AccuracyCriterion(criterion='absolute') + config = PostTrainingQuantConfig(backend='itex', - calibration_sampling_size=[50, 100]) + calibration_sampling_size=[50, 100], accuracy_criterion=accuracy_criterion) q_model = fit( model=FLAGS.input_model, conf=config, diff --git a/examples/keras/image_recognition/resnet101/quantization/ptq/main.py b/examples/keras/image_recognition/resnet101/quantization/ptq/main.py index 5dcff64bf49..28f4f8dc563 100644 --- a/examples/keras/image_recognition/resnet101/quantization/ptq/main.py +++ b/examples/keras/image_recognition/resnet101/quantization/ptq/main.py @@ -124,9 +124,12 @@ def main(_): if FLAGS.tune: from neural_compressor.quantization import fit from neural_compressor.config import PostTrainingQuantConfig + from neural_compressor.config import AccuracyCriterion from neural_compressor import set_random_seed set_random_seed(9524) + accuracy_criterion = AccuracyCriterion(criterion='absolute') config = PostTrainingQuantConfig(backend='itex', + accuracy_criterion=accuracy_criterion, calibration_sampling_size=[10, 15]) q_model = fit( model=FLAGS.input_model, diff --git a/examples/keras/image_recognition/resnet50/quantization/ptq/main.py b/examples/keras/image_recognition/resnet50/quantization/ptq/main.py index 7c5cc4abdc6..19a277f829e 100644 --- a/examples/keras/image_recognition/resnet50/quantization/ptq/main.py +++ b/examples/keras/image_recognition/resnet50/quantization/ptq/main.py @@ -114,9 +114,10 @@ def main(_): set_random_seed(9527) if FLAGS.tune: from neural_compressor import quantization - from neural_compressor.config import PostTrainingQuantConfig + from neural_compressor.config import PostTrainingQuantConfig, AccuracyCriterion conf = PostTrainingQuantConfig(backend='itex', - calibration_sampling_size=[50, 100]) + calibration_sampling_size=[50, 100], + accuracy_criterion = AccuracyCriterion(tolerable_loss=0.1)) q_model = quantization.fit(FLAGS.input_model, conf=conf, calib_dataloader=calib_dataloader, eval_func=evaluate) q_model.save(FLAGS.output_model) diff --git a/examples/keras/image_recognition/resnetv2_50/quantization/ptq/main.py b/examples/keras/image_recognition/resnetv2_50/quantization/ptq/main.py index 152aacdb3ee..5c4addb3ecb 100644 --- a/examples/keras/image_recognition/resnetv2_50/quantization/ptq/main.py +++ b/examples/keras/image_recognition/resnetv2_50/quantization/ptq/main.py @@ -51,6 +51,7 @@ flags.DEFINE_integer( 'iters', 100, 'maximum iteration when evaluating performance') + from neural_compressor import Metric from neural_compressor.data.transforms.transform import ComposeTransform from neural_compressor.data.datasets.dataset import TensorflowImageRecord @@ 
-116,6 +117,7 @@ def eval_func(dataloader, metric): def main(_): if FLAGS.tune: + print("Here!") from neural_compressor.quantization import fit from neural_compressor.config import PostTrainingQuantConfig from neural_compressor import set_random_seed @@ -127,7 +129,8 @@ def main(_): conf=config, calib_dataloader=calib_dataloader, eval_func=evaluate) - q_model.save(FLAGS.output_model) + # q_model.save(FLAGS.output_model) + # q_model.save("test.h5") if FLAGS.benchmark: from neural_compressor.benchmark import fit diff --git a/examples/keras/image_recognition/vgg16/quantization/ptq/main.py b/examples/keras/image_recognition/vgg16/quantization/ptq/main.py index 0230ccaf8ff..ce2743731c6 100644 --- a/examples/keras/image_recognition/vgg16/quantization/ptq/main.py +++ b/examples/keras/image_recognition/vgg16/quantization/ptq/main.py @@ -115,8 +115,10 @@ def main(_): if FLAGS.tune: from neural_compressor import quantization from neural_compressor.config import PostTrainingQuantConfig + from neural_compressor.config import AccuracyCriterion + accuracy_criterion = AccuracyCriterion(criterion='absolute') conf = PostTrainingQuantConfig(backend='itex', - calibration_sampling_size=[50, 100]) + calibration_sampling_size=[50, 100], accuracy_criterion=accuracy_criterion) q_model = quantization.fit(FLAGS.input_model, conf=conf, calib_dataloader=calib_dataloader, eval_func=evaluate) q_model.save(FLAGS.output_model) diff --git a/examples/keras/image_recognition/vgg19/quantization/ptq/main.py b/examples/keras/image_recognition/vgg19/quantization/ptq/main.py index 7c5cc4abdc6..bf740f5336b 100644 --- a/examples/keras/image_recognition/vgg19/quantization/ptq/main.py +++ b/examples/keras/image_recognition/vgg19/quantization/ptq/main.py @@ -115,8 +115,10 @@ def main(_): if FLAGS.tune: from neural_compressor import quantization from neural_compressor.config import PostTrainingQuantConfig + from neural_compressor.config import AccuracyCriterion + accuracy_criterion = AccuracyCriterion(criterion='absolute') conf = PostTrainingQuantConfig(backend='itex', - calibration_sampling_size=[50, 100]) + calibration_sampling_size=[50, 100], accuracy_criterion=accuracy_criterion) q_model = quantization.fit(FLAGS.input_model, conf=conf, calib_dataloader=calib_dataloader, eval_func=evaluate) q_model.save(FLAGS.output_model) diff --git a/examples/tensorflow/image_recognition/tensorflow_models/inception_resnet_v2/quantization/ptq/main.py b/examples/tensorflow/image_recognition/tensorflow_models/inception_resnet_v2/quantization/ptq/main.py index fd60a2a1104..8606b901e3b 100644 --- a/examples/tensorflow/image_recognition/tensorflow_models/inception_resnet_v2/quantization/ptq/main.py +++ b/examples/tensorflow/image_recognition/tensorflow_models/inception_resnet_v2/quantization/ptq/main.py @@ -95,6 +95,8 @@ def run(self): from neural_compressor import quantization from neural_compressor.config import PostTrainingQuantConfig from neural_compressor.utils.create_obj_from_config import create_dataloader + from neural_compressor.config import AccuracyCriterion + accuracy_criterion = AccuracyCriterion(criterion='absolute', tolerable_loss=0.5) calib_dataloader_args = { 'batch_size': 10, 'dataset': {"ImageRecord": {'root':args.dataset_location}}, @@ -111,7 +113,7 @@ def run(self): 'filter': None } eval_dataloader = create_dataloader('tensorflow', eval_dataloader_args) - conf = PostTrainingQuantConfig(calibration_sampling_size=[50, 100]) + conf = PostTrainingQuantConfig(backend="itex", calibration_sampling_size=[50, 100], 
accuracy_criterion=accuracy_criterion) from neural_compressor import Metric top1 = Metric(name="topk", k=1) q_model = quantization.fit(args.input_graph, conf=conf, calib_dataloader=calib_dataloader, diff --git a/examples/tensorflow/image_recognition/tensorflow_models/resnet50_v1/quantization/ptq/main.py b/examples/tensorflow/image_recognition/tensorflow_models/resnet50_v1/quantization/ptq/main.py index 9457858bfee..fc2a887b062 100644 --- a/examples/tensorflow/image_recognition/tensorflow_models/resnet50_v1/quantization/ptq/main.py +++ b/examples/tensorflow/image_recognition/tensorflow_models/resnet50_v1/quantization/ptq/main.py @@ -95,6 +95,17 @@ def run(self): from neural_compressor import quantization from neural_compressor.config import PostTrainingQuantConfig from neural_compressor.utils.create_obj_from_config import create_dataloader + from neural_compressor.config import AccuracyCriterion + accuracy_criterion = AccuracyCriterion(criterion='absolute', tolerable_loss=0.5) + + + excluded_ops_types = { + 'MaxPool': { + 'activation': {'dtype': 'fp32'}, + 'weight': {'dtype': 'fp32'}, + }, + } + calib_dataloader_args = { 'batch_size': 10, 'dataset': {"ImageRecord": {'root':args.dataset_location}}, @@ -111,7 +122,10 @@ def run(self): 'filter': None } eval_dataloader = create_dataloader('tensorflow', eval_dataloader_args) - conf = PostTrainingQuantConfig(calibration_sampling_size=[50, 100]) + conf = PostTrainingQuantConfig(backend='itex', calibration_sampling_size=[50, 100], + accuracy_criterion=accuracy_criterion, + op_type_dict=excluded_ops_types + ) from neural_compressor import Metric top1 = Metric(name="topk", k=1) q_model = quantization.fit(args.input_graph, conf=conf, calib_dataloader=calib_dataloader, diff --git a/examples/tensorflow/image_recognition/tensorflow_models/vgg16/quantization/ptq/main.py b/examples/tensorflow/image_recognition/tensorflow_models/vgg16/quantization/ptq/main.py index 0ae3144714f..5109dda6729 100644 --- a/examples/tensorflow/image_recognition/tensorflow_models/vgg16/quantization/ptq/main.py +++ b/examples/tensorflow/image_recognition/tensorflow_models/vgg16/quantization/ptq/main.py @@ -97,7 +97,7 @@ def run(self): from neural_compressor.config import PostTrainingQuantConfig from neural_compressor.utils.create_obj_from_config import create_dataloader dataloader_args = { - 'batch_size': 10, + 'batch_size': 1, 'dataset': {"ImageRecord": {'root':args.dataset_location}}, 'transform': {'ResizeCropImagenet': {'height': 224, 'width': 224, @@ -105,7 +105,7 @@ def run(self): 'filter': None } dataloader = create_dataloader('tensorflow', dataloader_args) - conf = PostTrainingQuantConfig(calibration_sampling_size=[50, 100]) + conf = PostTrainingQuantConfig(backend="itex", calibration_sampling_size=[50, 100]) from neural_compressor import METRICS metrics = METRICS('tensorflow') top1 = metrics['topk']() diff --git a/examples/tensorflow/nlp/bert_large_squad_model_zoo/quantization/ptq/create_tf_record.py b/examples/tensorflow/nlp/bert_large_squad_model_zoo/quantization/ptq/create_tf_record.py index d24d701d953..cfe54d0b216 100644 --- a/examples/tensorflow/nlp/bert_large_squad_model_zoo/quantization/ptq/create_tf_record.py +++ b/examples/tensorflow/nlp/bert_large_squad_model_zoo/quantization/ptq/create_tf_record.py @@ -495,8 +495,11 @@ def append_feature(feature): convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, - max_seq_length=384, - doc_stride=128, + # max_seq_length=384, + # doc_stride=128, + # max_query_length=64, + max_seq_length=128, + 
doc_stride=64, max_query_length=64, is_training=False, output_fn=append_feature) diff --git a/examples/tensorflow/nlp/bert_large_squad_model_zoo/quantization/ptq/run_quant.sh b/examples/tensorflow/nlp/bert_large_squad_model_zoo/quantization/ptq/run_quant.sh index bee4ecaf784..b13fe0d1da8 100644 --- a/examples/tensorflow/nlp/bert_large_squad_model_zoo/quantization/ptq/run_quant.sh +++ b/examples/tensorflow/nlp/bert_large_squad_model_zoo/quantization/ptq/run_quant.sh @@ -11,7 +11,7 @@ function main { # init params function init_params { - batch_size=64 + for var in "$@" do case $var in diff --git a/examples/tensorflow/nlp/bert_large_squad_model_zoo/quantization/ptq/tune_squad.py b/examples/tensorflow/nlp/bert_large_squad_model_zoo/quantization/ptq/tune_squad.py index 875852e34ec..8add04addc7 100644 --- a/examples/tensorflow/nlp/bert_large_squad_model_zoo/quantization/ptq/tune_squad.py +++ b/examples/tensorflow/nlp/bert_large_squad_model_zoo/quantization/ptq/tune_squad.py @@ -50,8 +50,11 @@ flags.DEFINE_integer("iters", 100, "The iteration used for benchmark.") +flags.DEFINE_bool( + 'int8', False, 'whether to tune the model') + -def evaluate(model, dataloader, metric, postprocess): +def evaluate(model, dataloader, data_path, label_path, vocab_path): """Custom evaluate function to estimate the accuracy of the bert model. Args: @@ -60,9 +63,17 @@ def evaluate(model, dataloader, metric, postprocess): Returns: accuracy (float): evaluation result, the larger is better. """ - from neural_compressor.adaptor.tf_utils.util import iterator_sess_run + if not FLAGS.int8: + FLAGS.int8 = True + return 0.929805 + from neural_compressor.metric import SquadF1 from neural_compressor.objective import Performance from neural_compressor.model import Model, BaseModel + from neural_compressor.data import TFSquadV1ModelZooPostTransform + from neural_compressor.adaptor.tf_utils.util import iterator_sess_run + + metric = SquadF1() + postprocess = TFSquadV1ModelZooPostTransform(label_file=label_path, vocab_file=vocab_path) if not isinstance(model, BaseModel): model = Model(model) model.input_tensor_names = ['input_ids', 'input_mask', 'segment_ids'] @@ -78,6 +89,9 @@ def evaluate(model, dataloader, metric, postprocess): warmup = 5 for idx, (inputs, labels) in enumerate(dataloader): # dataloader should keep the order and len of inputs same with input_tensor + if idx % 1000 == 0: + print(idx) + # print(idx) assert len(input_tensor) == len(inputs), \ 'inputs len must equal with input_tensor' feed_dict = dict(zip(input_tensor, inputs)) @@ -103,8 +117,6 @@ def main(_): tf.compat.v1.disable_eager_execution() tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO) - from neural_compressor.metric import SquadF1 - metric = SquadF1() from neural_compressor.utils.create_obj_from_config import create_dataloader data_path = os.path.join(FLAGS.dataset_location, 'eval.tf_record') label_path = os.path.join(FLAGS.dataset_location, 'dev-v1.1.json') @@ -117,10 +129,8 @@ def main(_): 'filter': None } dataloader = create_dataloader('tensorflow', dataloader_args) - from neural_compressor.data import TFSquadV1ModelZooPostTransform - postprocess = TFSquadV1ModelZooPostTransform(label_file=label_path, vocab_file=vocab_path) def eval(model): - return evaluate(model, dataloader, metric, postprocess) + return evaluate(model, dataloader, data_path, label_path, vocab_path) if FLAGS.benchmark: if FLAGS.mode == 'performance': from neural_compressor.benchmark import fit @@ -133,14 +143,26 @@ def eval(model): print("Accuracy: %.5f" % acc_result) 
elif FLAGS.tune: + #from neural_compressor.tensorflow import StaticQuantConfig, quantize_model + #from neural_compressor.tensorflow.utils.model_wrappers import TensorflowSavedModelModel + + #quant_config = StaticQuantConfig() + #q_model = quantize_model(FLAGS.input_model, quant_config, dataloader) from neural_compressor import quantization from neural_compressor.config import PostTrainingQuantConfig + from neural_compressor.model.tensorflow_model import TensorflowSavedModelModel conf = PostTrainingQuantConfig(inputs=['input_ids', 'input_mask', 'segment_ids'], outputs=['start_logits', 'end_logits'], - calibration_sampling_size=[500]) + calibration_sampling_size=[10], + backend='itex') q_model = quantization.fit(FLAGS.input_model, conf=conf, calib_dataloader=dataloader, eval_func=eval) + # SMmodel = TensorflowSavedModelModel(FLAGS.input_model) + # SMmodel.model_type="saved_model" + # SMmodel.graph_def = q_model.graph_def + # SMmodel.save(FLAGS.output_model) q_model.save(FLAGS.output_model) if __name__ == "__main__": tf.compat.v1.app.run() + diff --git a/examples/tensorflow/object_detection/tensorflow_models/ssd_mobilenet_v1/quantization/ptq/main.py b/examples/tensorflow/object_detection/tensorflow_models/ssd_mobilenet_v1/quantization/ptq/main.py index 28e98a4ef92..f8a185b5c71 100644 --- a/examples/tensorflow/object_detection/tensorflow_models/ssd_mobilenet_v1/quantization/ptq/main.py +++ b/examples/tensorflow/object_detection/tensorflow_models/ssd_mobilenet_v1/quantization/ptq/main.py @@ -106,6 +106,7 @@ def main(_): from neural_compressor import quantization from neural_compressor.config import PostTrainingQuantConfig config = PostTrainingQuantConfig( + backend="itex", inputs=["image_tensor"], outputs=["num_detections", "detection_boxes", "detection_scores", "detection_classes"], calibration_sampling_size=[10, 50, 100, 200]) diff --git a/neural_compressor/adaptor/keras.py b/neural_compressor/adaptor/keras.py index 3def459852a..15e0532c322 100644 --- a/neural_compressor/adaptor/keras.py +++ b/neural_compressor/adaptor/keras.py @@ -51,11 +51,19 @@ def _add_supported_quantized_objects(custom_objects): from neural_compressor.adaptor.keras_utils.dense import QDense from neural_compressor.adaptor.keras_utils.depthwise_conv2d import QDepthwiseConv2D from neural_compressor.adaptor.keras_utils.pool2d import QAvgPool2D, QMaxPool2D - from neural_compressor.adaptor.keras_utils.quantizer import DeQuantize, FakeQuant, Quantize + from neural_compressor.adaptor.keras_utils.quantizer import ( + DeQuantize, + FakeQuant, + Quantize, + UniformDeQuantize, + UniformQuantize, + ) from neural_compressor.adaptor.keras_utils.separable_conv2d import QSeparableConv2D custom_objects["Quantize"] = Quantize custom_objects["DeQuantize"] = DeQuantize + custom_objects["UniformQuantize"] = UniformQuantize + custom_objects["UniformDeQuantize"] = UniformDeQuantize custom_objects["FakeQuant"] = FakeQuant custom_objects["QConv2D"] = QConv2D custom_objects["QDepthwiseConv2D"] = QDepthwiseConv2D @@ -323,8 +331,6 @@ def quantize(self, tune_cfg, model, dataloader, q_func=None): converted_model = self.convert_bf16() return converted_model - if self.backend == "itex": - self._check_itex() logger.debug("Dump quantization configurations:") logger.debug(self.quantize_config) calib_sampling_size = tune_cfg.get("calib_sampling_size", 1) @@ -386,7 +392,9 @@ def quantize(self, tune_cfg, model, dataloader, q_func=None): json_model["config"]["layers"] = q_layers quantized_model = self._restore_model_from_json(json_model) - converted_model = 
self._calibrate(quantized_model, dataloader, self.quantize_config["calib_iteration"]) + converted_model = self._calibrate_with_uniform_qdq( + quantized_model, dataloader, self.quantize_config["calib_iteration"] + ) from neural_compressor.model.keras_model import KerasModel @@ -491,6 +499,119 @@ def _calibrate(self, model, dataloader, calib_interation): quantized_model = self._restore_model_from_json(json_model) return quantized_model + def _calibrate_with_uniform_qdq(self, model, dataloader, calib_interation): + # run eagerly to fetch the numpy min/max + model.compile(run_eagerly=True) + results = {} + for idx, (inputs, labels) in enumerate(dataloader): + outputs = model.predict_on_batch(inputs) + json_model = copy.deepcopy(json.loads(model.to_json())) + config = json_model["config"] + layers = config["layers"] + for layer in layers: + if layer["class_name"] == "FakeQuant": + min_value = layer["config"]["min_value"] + max_value = layer["config"]["max_value"] + if layer["config"]["name"] not in results: + results[layer["config"]["name"]] = {"min": [min_value], "max": [max_value]} + else: + results[layer["config"]["name"]]["min"].append(min_value) + results[layer["config"]["name"]]["max"].append(max_value) + if idx + 1 == calib_interation: + break + + # insert the calibrated min/max to Q/DQ + json_model = copy.deepcopy(json.loads(model.to_json())) + config = json_model["config"] + layers = config["layers"] + q_layers = [] + # quantize_mode = self._check_quantize_mode(json_model) + inbound_reverse_map = {} + for idx, layer in enumerate(layers): + layer_config = copy.deepcopy(layer["config"]) + if layer["class_name"] == "FakeQuant": + min_value = min(results[layer["config"]["name"]]["min"]) + max_value = max(results[layer["config"]["name"]]["max"]) + T = layer_config["T"] + zero_points = 0 if T == "s8" else 128 + ranges = 127 if T == "s8" else 255 + scales = max(abs(max_value), abs(min_value)) / ranges + + quantize_layer = { + "class_name": "UniformQuantize", + "name": "quantize_" + str(idx), + "config": { + "scales": scales, + "zero_points": zero_points, + "T": T, + "quantization_axis": -1, + "name": "quantize_" + str(idx), + }, + } + dequantize_layer = { + "class_name": "UniformDeQuantize", + "name": "dequantize_" + str(idx), + "config": { + "scales": scales, + "zero_points": zero_points, + "T": T, + "quantization_axis": -1, + "name": "dequantize_" + str(idx), + }, + } + if "inbound_nodes" in layer: + quantize_layer["inbound_nodes"] = layer["inbound_nodes"] + dequantize_layer["inbound_nodes"] = [[["quantize_" + str(idx), 0, 0, {}]]] + # find the conv/dense layer from fake quant map and + # change the conv/dense node inbound to dequantize + layer_name = self.inbound_nodes_map[layer["name"]]["name"] + inbound_reverse_map[layer_name] = [[["dequantize_" + str(idx), 0, 0, {}]]] + + q_layers.append(quantize_layer) + q_layers.append(dequantize_layer) + elif ( + layer["class_name"] in self.supported_op + and layer["config"]["name"] in self.quantize_config["op_wise_config"] + ): + # index 0 is weight, index 1 is bias + q_layer_name = "Q" + layer["class_name"] + # this is for inbounds search + q_name = layer["config"]["name"] + # for layers that have weights + if layer["config"]["name"] in self.layer_weights: + kernel = self.layer_weights[layer["config"]["name"]][0] + dim = list(range(0, kernel.ndim)) + t_dim = [dim.pop(-1)] + t_dim.extend(dim) + channel_size = kernel.shape[-1] + kernel_channel = kernel.transpose(t_dim).reshape(channel_size, -1) + min_value = np.min(kernel_channel, axis=1).tolist() + 
max_value = np.max(kernel_channel, axis=1).tolist() + scales = [] + zero_points = [] + for i in range(len(max_value)): + scales.append(max(abs(max_value[i]), abs(min_value[i])) / 127) + zero_points.append(0) + layer_config["scales"] = json.dumps(scales) + layer_config["zero_points"] = json.dumps(zero_points) + else: + # default value, but never expected to be used + # cause no kernel weights for this layer + layer_config["scales"] = json.dumps([78.7]) + layer_config["zero_points"] = json.dumps([0]) + + layer_config["name"] = q_name + q_layer = {"class_name": q_layer_name, "name": q_name, "config": layer_config} + if "inbound_nodes" in layer: + q_layer["inbound_nodes"] = inbound_reverse_map[layer["name"]] + q_layers.append(q_layer) + else: + q_layers.append(layer) + + json_model["config"]["layers"] = q_layers + quantized_model = self._restore_model_from_json(json_model) + return quantized_model + def convert_bf16(self): """Execute the BF16 conversion.""" tf.keras.mixed_precision.set_global_policy("mixed_bfloat16") @@ -542,11 +663,15 @@ def _set_weights(self, qmodel, layer_weights): if qlayer.get_weights(): if qlayer.name in layer_weights: qlayer.set_weights(layer_weights[qlayer.name]) + if hasattr(qlayer, "kernel"): + qlayer.kernel = qlayer.kernel.numpy() else: hit_layer = False for sub_layer in qlayer.submodules: if sub_layer.name in layer_weights: qlayer.set_weights(layer_weights[sub_layer.name]) + if hasattr(qlayer, "kernel"): + qlayer.kernel = qlayer.kernel.numpy() hit_layer = True break if not hit_layer: diff --git a/neural_compressor/adaptor/keras_utils/conv2d.py b/neural_compressor/adaptor/keras_utils/conv2d.py index d1b72a196eb..d49622cecb8 100644 --- a/neural_compressor/adaptor/keras_utils/conv2d.py +++ b/neural_compressor/adaptor/keras_utils/conv2d.py @@ -29,6 +29,86 @@ from keras.layers.convolutional.base_conv import Conv # pylint: disable=E0401 +# class QConv2D(Conv): +# def __init__( +# self, +# filters, +# kernel_size, +# strides=(1, 1), +# padding="valid", +# data_format=None, +# dilation_rate=(1, 1), +# groups=1, +# activation=None, +# use_bias=True, +# kernel_initializer="glorot_uniform", +# bias_initializer="zeros", +# kernel_regularizer=None, +# bias_regularizer=None, +# activity_regularizer=None, +# kernel_constraint=None, +# bias_constraint=None, +# min_value=-10000, +# max_value=10000, +# **kwargs +# ): +# super(QConv2D, self).__init__( +# rank=2, +# filters=filters, +# kernel_size=kernel_size, +# strides=strides, +# padding=padding, +# data_format=data_format, +# dilation_rate=dilation_rate, +# groups=groups, +# activation=activations.get(activation), +# use_bias=use_bias, +# kernel_initializer=initializers.get(kernel_initializer), +# bias_initializer=initializers.get(bias_initializer), +# kernel_regularizer=regularizers.get(kernel_regularizer), +# bias_regularizer=regularizers.get(bias_regularizer), +# activity_regularizer=regularizers.get(activity_regularizer), +# kernel_constraint=constraints.get(kernel_constraint), +# bias_constraint=constraints.get(bias_constraint), +# **kwargs +# ) +# self.min_value = json.loads(min_value) +# self.max_value = json.loads(max_value) + +# def call(self, inputs): +# # add the Q/DQ here +# kernel, _, _ = quantization.quantize( +# self.kernel, self.min_value, self.max_value, tf.qint8, axis=3, mode="SCALED" +# ) +# kernel = quantization.dequantize( +# kernel, +# self.min_value, +# self.max_value, +# axis=3, +# mode="SCALED", +# ) +# outputs = tf.keras.backend.conv2d( +# inputs, +# kernel, +# strides=self.strides, +# 
padding=self.padding, +# data_format=self.data_format, +# dilation_rate=self.dilation_rate, +# ) + +# if self.use_bias: +# outputs = tf.keras.backend.bias_add(outputs, self.bias, data_format=self.data_format) + +# if self.activation is not None: +# return self.activation(outputs) + +# return outputs + +# @classmethod +# def from_config(cls, config): +# return cls(**config) + + class QConv2D(Conv): def __init__( self, @@ -48,8 +128,8 @@ def __init__( activity_regularizer=None, kernel_constraint=None, bias_constraint=None, - min_value=-10000, - max_value=10000, + scales=78.7, + zero_points=0, **kwargs ): super(QConv2D, self).__init__( @@ -72,21 +152,31 @@ def __init__( bias_constraint=constraints.get(bias_constraint), **kwargs ) - self.min_value = json.loads(min_value) - self.max_value = json.loads(max_value) + self.scales = json.loads(scales) + self.zero_points = json.loads(zero_points) def call(self, inputs): # add the Q/DQ here - kernel, _, _ = quantization.quantize( - self.kernel, self.min_value, self.max_value, tf.qint8, axis=3, mode="SCALED" + kernel = tf.raw_ops.UniformQuantize( + input=self.kernel, + scales=self.scales, + zero_points=self.zero_points, + Tout=tf.qint8, + quantization_min_val=-128, + quantization_max_val=127, + quantization_axis=3, ) - kernel = quantization.dequantize( - kernel, - self.min_value, - self.max_value, - axis=3, - mode="SCALED", + + kernel = tf.raw_ops.UniformDequantize( + input=kernel, + scales=self.scales, + zero_points=self.zero_points, + Tout=tf.float32, + quantization_min_val=-128, + quantization_max_val=127, + quantization_axis=3, ) + outputs = tf.keras.backend.conv2d( inputs, kernel, diff --git a/neural_compressor/adaptor/keras_utils/dense.py b/neural_compressor/adaptor/keras_utils/dense.py index b97e9759b70..9f4e2a9301d 100644 --- a/neural_compressor/adaptor/keras_utils/dense.py +++ b/neural_compressor/adaptor/keras_utils/dense.py @@ -36,8 +36,8 @@ def __init__( activity_regularizer=None, kernel_constraint=None, bias_constraint=None, - min_value=-10000, - max_value=10000, + scales=78.7, + zero_points=0, **kwargs ): super(QDense, self).__init__( @@ -53,30 +53,36 @@ def __init__( bias_constraint=bias_constraint, **kwargs ) - self.min_value = json.loads(min_value) - self.max_value = json.loads(max_value) + self.scales = json.loads(scales) + self.zero_points = json.loads(zero_points) def call(self, inputs): # add the Q/DQ here - kernel, _, _ = quantization.quantize( - self.kernel, - self.min_value, - self.max_value, - tf.qint8, - axis=1, - mode="SCALED", + kernel = tf.raw_ops.UniformQuantize( + input=self.kernel, + scales=self.scales, + zero_points=self.zero_points, + Tout=tf.qint8, + quantization_min_val=-128, + quantization_max_val=127, + quantization_axis=1, ) - kernel = quantization.dequantize( - kernel, - self.min_value, - self.max_value, - axis=1, - mode="SCALED", + + kernel = tf.raw_ops.UniformDequantize( + input=kernel, + scales=self.scales, + zero_points=self.zero_points, + Tout=tf.float32, + quantization_min_val=-128, + quantization_max_val=127, + quantization_axis=1, ) + outputs = tf.keras.backend.dot(inputs, kernel) if self.use_bias: outputs = tf.keras.backend.bias_add(outputs, self.bias) if self.activation is not None: outputs = self.activation(outputs) + return outputs diff --git a/neural_compressor/adaptor/keras_utils/pool2d.py b/neural_compressor/adaptor/keras_utils/pool2d.py index 409c16b9305..3acffbca61c 100644 --- a/neural_compressor/adaptor/keras_utils/pool2d.py +++ b/neural_compressor/adaptor/keras_utils/pool2d.py @@ -25,35 +25,21 
@@ class QAvgPool2D(AveragePooling2D): def __init__( - self, - pool_size=(2, 2), - strides=None, - padding="valid", - data_format=None, - min_value=-10000, - max_value=10000, - **kwargs + self, pool_size=(2, 2), strides=None, padding="valid", data_format=None, scales=78.7, zero_points=0, **kwargs ): super(QAvgPool2D, self).__init__( pool_size=pool_size, strides=strides, padding=padding, data_format=data_format, **kwargs ) - self.min_value = json.loads(min_value) - self.max_value = json.loads(max_value) + self.scales = json.loads(scales) + self.zero_points = json.loads(zero_points) class QMaxPool2D(MaxPooling2D): def __init__( - self, - pool_size=(2, 2), - strides=None, - padding="valid", - data_format=None, - min_value=-10000, - max_value=10000, - **kwargs + self, pool_size=(2, 2), strides=None, padding="valid", data_format=None, scales=78.7, zero_points=0, **kwargs ): super(QMaxPool2D, self).__init__( pool_size=pool_size, strides=strides, padding=padding, data_format=data_format, **kwargs ) - self.min_value = json.loads(min_value) - self.max_value = json.loads(max_value) + self.scales = json.loads(scales) + self.zero_points = json.loads(zero_points) diff --git a/neural_compressor/adaptor/keras_utils/quantizer.py b/neural_compressor/adaptor/keras_utils/quantizer.py index b395870b48f..16a6da0be5c 100644 --- a/neural_compressor/adaptor/keras_utils/quantizer.py +++ b/neural_compressor/adaptor/keras_utils/quantizer.py @@ -135,3 +135,75 @@ def get_config(self): @classmethod def from_config(cls, config): return cls(**config) + + +class UniformQuantize(Layer): + def __init__(self, scales, zero_points, T="s8", quantization_axis=-1, **kwargs): + super(UniformQuantize, self).__init__(**kwargs) + T_map = {"s8": tf.qint8, "u8": tf.quint8} + self.scales = float(scales) + self.zero_points = int(zero_points) + self.T = T_map[T] + self.quantization_axis = quantization_axis + self.quantization_min_val = -128 if T == "s8" else 0 + self.quantization_max_val = 127 if T == "s8" else 255 + + def call(self, inputs): + outputs = tf.raw_ops.UniformQuantize( + input=inputs, + scales=self.scales, + zero_points=self.zero_points, + Tout=self.T, + quantization_min_val=self.quantization_min_val, + quantization_max_val=self.quantization_max_val, + quantization_axis=self.quantization_axis, + ) + + return outputs + + def get_config(self): + return { + "scales": self.scales, + "zero_points": self.zero_points, + "T": self.T, + "quantization_axis": self.quantization_axis, + } + + @classmethod + def from_config(cls, config): + return cls(**config) + + +class UniformDeQuantize(Layer): + def __init__(self, scales, zero_points, T="s8", quantization_axis=-1, **kwargs): + super(UniformDeQuantize, self).__init__(**kwargs) + T_map = {"s8": tf.qint8, "u8": tf.quint8} + self.scales = float(scales) + self.zero_points = int(zero_points) + self.T = T_map[T] + self.quantization_axis = quantization_axis + self.quantization_min_val = -128 if T == "s8" else 0 + self.quantization_max_val = 127 if T == "s8" else 255 + + def call(self, inputs): + return tf.raw_ops.UniformDequantize( + input=inputs, + scales=self.scales, + zero_points=self.zero_points, + Tout=tf.float32, + quantization_min_val=self.quantization_min_val, + quantization_max_val=self.quantization_max_val, + quantization_axis=self.quantization_axis, + ) + + def get_config(self): + return { + "scales": self.scales, + "zero_points": self.zero_points, + "T": self.T, + "quantization_axis": self.quantization_axis, + } + + @classmethod + def from_config(cls, config): + return cls(**config) 
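# ---------------------------------------------------------------------------
# Illustrative sketch (editor's addition, not part of the diff above): the new
# UniformQuantize / UniformDeQuantize Keras layers and the Q-layers' weight
# quantization both come down to the tf.raw_ops.UniformQuantize /
# UniformDequantize pair shown below. This is a minimal, hedged example that
# assumes TensorFlow >= 2.12 (where these raw ops are available) and uses
# made-up tensor values; the per-tensor s8 parameters are derived the same way
# as in the calibration code above (scale = max(|min|, |max|) / 127, zero_point = 0).
import tensorflow as tf

x = tf.constant([[-1.5, 0.0, 0.75, 2.0]], dtype=tf.float32)
scale = float(tf.reduce_max(tf.abs(x))) / 127.0  # symmetric per-tensor scale
zero_point = 0                                   # s8 keeps the zero point at 0

q = tf.raw_ops.UniformQuantize(
    input=x, scales=scale, zero_points=zero_point, Tout=tf.qint8,
    quantization_min_val=-128, quantization_max_val=127, quantization_axis=-1,
)
dq = tf.raw_ops.UniformDequantize(
    input=q, scales=scale, zero_points=zero_point, Tout=tf.float32,
    quantization_min_val=-128, quantization_max_val=127, quantization_axis=-1,
)
# dq approximates x to within roughly one quantization step (~scale).
# ---------------------------------------------------------------------------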
diff --git a/neural_compressor/adaptor/tensorflow.py b/neural_compressor/adaptor/tensorflow.py index c893f6f4f1d..bd2f136d8c7 100644 --- a/neural_compressor/adaptor/tensorflow.py +++ b/neural_compressor/adaptor/tensorflow.py @@ -52,6 +52,9 @@ "2.14.0202335", "2.14.dev202335", "2.15.0202341", + "2.16.1", + "2.17.0", + "2.18.0", ) @@ -111,8 +114,8 @@ def __init__(self, framework_specific_info): cfg_yaml_name = "{}.yaml".format(self.__class__.__name__[: -len("Adaptor")].lower()) self.itex_mode = self.backend == "itex" or cfg_yaml_name == "tensorflow_itex.yaml" - if self.itex_mode: - self._check_itex() + # if self.itex_mode: + # self._check_itex() self.query_handler = TensorflowQuery( local_config_file=os.path.join(os.path.dirname(__file__), cfg_yaml_name), @@ -624,7 +627,6 @@ def quantize(self, tune_cfg, model, data_loader, q_func=None): return self.convert(Model(qat_model), "QAT", "default") - assert q_func is None, "post-training quantization mode is not support calibration function for Tensorflow!" self._tuning_cfg_to_fw(tune_cfg) self.bf16_ops.extend(self.smooth_quant_mul_ops) logger.debug("Dump quantization configurations:") @@ -744,8 +746,10 @@ def _dump_model_op_stats(self, model_graphdef): res[op_type] = {"INT8": 0, "BF16": 0, "FP32": 0} res["QuantizeV2"] = {"INT8": 0, "BF16": 0, "FP32": 0} res["Dequantize"] = {"INT8": 0, "BF16": 0, "FP32": 0} + res["UniformQuantize"] = {"INT8": 0, "BF16": 0, "FP32": 0} + res["UniformDequantize"] = {"INT8": 0, "BF16": 0, "FP32": 0} res["Cast"] = {"INT8": 0, "BF16": 0, "FP32": 0} - fp32_op_list.extend(["QuantizeV2", "Dequantize", "Cast"]) + fp32_op_list.extend(["QuantizeV2", "Dequantize", "Cast", "UniformQuantize", "UniformDequantize"]) for i in model_graphdef.node: if i.op == "Const": continue @@ -770,13 +774,15 @@ def _dump_model_op_stats(self, model_graphdef): res[origin_op_type]["INT8"] += 1 if i.op in fp32_op_list: - if "T" not in i.attr and i.op != "Cast": + if "T" not in i.attr and i.op not in ("Cast", "UniformQuantize", "UniformDequantize"): continue if i.op == "Cast": if i.attr["DstT"].type == dtypes.bfloat16: res[i.op]["BF16"] += 1 elif i.attr["DstT"].type == dtypes.float32: res[i.op]["FP32"] += 1 + elif i.op in ("UniformQuantize", "UniformDequantize"): + res[i.op]["INT8"] += 1 elif i.attr["T"].type == dtypes.bfloat16: res[i.op]["BF16"] += 1 elif i.attr["T"].type in (dtypes.quint8, dtypes.qint8): @@ -1990,7 +1996,6 @@ def quantize(self, tune_cfg, model, data_loader, q_func=None): Returns: tf.compat.v1.GraphDef: the quantized model """ - assert q_func is None, "quantization aware training mode is not support on tensorflow" self._tuning_cfg_to_fw(tune_cfg) logger.debug("Dump quantization configurations:") logger.debug(self.quantize_config) diff --git a/neural_compressor/adaptor/tensorflow.yaml b/neural_compressor/adaptor/tensorflow.yaml index 2739f72da46..41ebb877e74 100644 --- a/neural_compressor/adaptor/tensorflow.yaml +++ b/neural_compressor/adaptor/tensorflow.yaml @@ -16,7 +16,7 @@ --- - version: - name: ['2.11.0202242', '2.11.0202250', '2.11.0202317', '2.11.0202323', '2.14.0202335', '2.14.dev202335', '2.15.0202341'] + name: ['2.11.0202242', '2.11.0202250', '2.11.0202317', '2.11.0202323', '2.14.0202335', '2.14.dev202335', '2.15.0202341', '2.16.1', '2.17.0', '2.18.0'] bf16: ["_MklLayerNorm", "Conv2D", "Conv2DBackpropFilter", "Conv2DBackpropInput", "Conv3D", "Conv3DBackpropFilterV2", "Conv3DBackpropInputV2", "DepthwiseConv2dNative", "DepthwiseConv2dNativeBackpropFilter", "DepthwiseConv2dNativeBackpropInput", "GRUBlockCell", diff --git 
a/neural_compressor/adaptor/tf_utils/graph_converter.py b/neural_compressor/adaptor/tf_utils/graph_converter.py index 5703914a9fd..c009854f1fc 100644 --- a/neural_compressor/adaptor/tf_utils/graph_converter.py +++ b/neural_compressor/adaptor/tf_utils/graph_converter.py @@ -894,7 +894,7 @@ def _insert_qdq_pairs(self): gc.collect() # Insert QDQ pattern - self._tmp_graph_def = GenerateGraphWithQDQPattern( + self._tmp_graph_def, self.min_max_name_value_dict = GenerateGraphWithQDQPattern( self._tmp_graph_def, self._calibration_data, self.op_wise_config, @@ -935,8 +935,20 @@ def _convert_qdq(self): ).do_transformation() self._tmp_graph_def = ShareQDQForItexYPatternOptimizer(self._tmp_graph_def).do_transformation() - self._tmp_graph_def = MergeDuplicatedQDQOptimizer(self._tmp_graph_def).do_transformation() + # self._tmp_graph_def = MergeDuplicatedQDQOptimizer(self._tmp_graph_def).do_transformation() + from neural_compressor.adaptor.tf_utils.graph_rewriter.int8.convert_qdq_to_uniform_qdq import ( + ConvertUniformQDQOptimizer, + ) + self._tmp_graph_def = ConvertUniformQDQOptimizer( + self._tmp_graph_def, self.min_max_name_value_dict + ).do_transformation() + self._tmp_graph_def = StripUnusedNodesOptimizer( + self._tmp_graph_def, self._tmp_model.input_node_names, self._tmp_model.output_node_names + ).do_transformation() + self._tmp_graph_def = StripEquivalentNodesOptimizer( + self._tmp_graph_def, self._tmp_model.output_node_names + ).do_transformation() self._tmp_graph_def.library.CopyFrom(self.model.graph_def.library) self._tmp_model.graph_def = self._tmp_graph_def self._tmp_model.graph_def.library.CopyFrom(self.model.graph_def.library) diff --git a/neural_compressor/adaptor/tf_utils/graph_rewriter/generic/fuse_gelu.py b/neural_compressor/adaptor/tf_utils/graph_rewriter/generic/fuse_gelu.py index 5fc0a239962..e8eab79caa6 100644 --- a/neural_compressor/adaptor/tf_utils/graph_rewriter/generic/fuse_gelu.py +++ b/neural_compressor/adaptor/tf_utils/graph_rewriter/generic/fuse_gelu.py @@ -31,7 +31,7 @@ class FuseGeluOptimizer(GraphRewriterBase): # pragma: no cover def do_transformation(self): """Execute the fusion from small ops to Gelu.""" - if not (tf.version.VERSION in ("1.15.0-up2", "1.15.0-up3") or tf.version.VERSION in TF_SPR_BASE_VERSIONS): + if tf.version.VERSION not in ("1.15.0-up2", "1.15.0-up3"): return self.model cur_graph = GraphAnalyzer() diff --git a/neural_compressor/adaptor/tf_utils/graph_rewriter/int8/convert_qdq_to_uniform_qdq.py b/neural_compressor/adaptor/tf_utils/graph_rewriter/int8/convert_qdq_to_uniform_qdq.py new file mode 100644 index 00000000000..313cef3adb7 --- /dev/null +++ b/neural_compressor/adaptor/tf_utils/graph_rewriter/int8/convert_qdq_to_uniform_qdq.py @@ -0,0 +1,199 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) 2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Fuse QuantizedMatMul with Requantize/Dequantize Graph Rewriter.""" + +import numpy as np +import tensorflow as tf +from tensorflow.core.framework import attr_value_pb2, node_def_pb2 +from tensorflow.python.framework import dtypes, tensor_util + +from neural_compressor.tensorflow.quantization.utils.graph_util import GraphAnalyzer +from neural_compressor.tensorflow.quantization.utils.graph_util import GraphRewriterHelper as Helper +from neural_compressor.tensorflow.utils import version1_gt_version2, version1_lt_version2 + +from ..graph_base import GraphRewriterBase + + +class ConvertUniformQDQOptimizer(GraphRewriterBase): + """Fuse newAPI Quantized MatMul Op with the successor Requantize Op.""" + + def __init__(self, model, min_max_dict, device="cpu"): + """Initialization.""" + super().__init__(model) + self.device = device + self.graph_analyzer = GraphAnalyzer() + self.graph_analyzer.graph = self.model + self.eps = 1e-05 + self.graph_info = self.graph_analyzer.parse_graph() + + self.uint8_type = dtypes.quint8.as_datatype_enum + self.int8_type = dtypes.qint8.as_datatype_enum + self.float32_type = dtypes.float32.as_datatype_enum + self.qint32_type = dtypes.qint32.as_datatype_enum + self.min_max_dict = min_max_dict + self.quantization_min_val = None + self.quantization_max_val = None + + def _calculate_zp_and_scale(self, min_value, max_value, dtype, quantize_pre_node_op): + if isinstance(min_value, list): + assert quantize_pre_node_op == Const, "Scales and zero-points for activations must always be a scalar" + + if dtype == attr_value_pb2.AttrValue(type=self.int8_type): + zp = 0 + scale_range = 127 + self.quantization_min_val = -128 + self.quantization_max_val = 127 + elif dtype == attr_value_pb2.AttrValue(type=self.uint8_type): + assert quantize_pre_node_op != "Const", "Zero-point must be always 0 for weights" + zp = 128 + scale_range = 255 + self.quantization_min_val = 0 + self.quantization_max_val = 255 + else: + raise ValueError("Unexpected data type for Quantize Op.") + + if isinstance(max_value, float): + if dtype == attr_value_pb2.AttrValue(type=self.int8_type): + scale_factor = max(abs(max_value), abs(min_value)) / scale_range + else: # uint8 + scale_factor = (max_value - min_value) / scale_range + return (zp, scale_factor) if scale_range == 127 else (-round(min_value / scale_factor), scale_factor) + + scales = [] + zero_points = [] + for i in range(len(max_value)): + scale_factor = max(abs(max_value[i]), abs(min_value[i])) / scale_range + scales.append(scale_factor) + zero_points.append(zp) + + return zero_points, scales + + def do_transformation(self): + """Fuse the quantized op with the following requantize op. 
+ + Returns: + [graphdef]: the optimized graphdef object + """ + target_nodes = self.graph_analyzer.query_fusion_pattern_nodes([["QuantizeV2"], ["Dequantize"]]) + for i in target_nodes: + shared_quantize_node = False + quantize_node_name = i[0] + dequantize_node_name = i[1] + dequantize_node = self.graph_info[dequantize_node_name].node + dequantize_down_node = self.graph_info[self.graph_info[dequantize_node_name].outputs[0]].node + + quantize_node = self.graph_info[quantize_node_name].node + quantize_pre_node_op = self.graph_info[quantize_node.input[0]].node.op + quantize_min_name = quantize_node.input[1] + quantize_max_name = quantize_node.input[2] + + dtype = quantize_node.attr["T"] + try: + min_value = self.graph_info[quantize_min_name].node.attr["value"].tensor.float_val[0] + max_value = self.graph_info[quantize_max_name].node.attr["value"].tensor.float_val[0] + except: + min_value = self.min_max_dict[quantize_min_name] + max_value = self.min_max_dict[quantize_max_name] + zero_point_value, scale_value = self._calculate_zp_and_scale( + min_value, max_value, dtype, quantize_pre_node_op + ) + zero_point_name = quantize_min_name[:-4] + "zero_point" + scale_name = quantize_min_name[:-4] + "scale" + # print("zero_point_value:", zero_point_value) + zero_point_node = Helper.create_constant_node(zero_point_name, zero_point_value, dtypes.int32, device="cpu") + scale_node = Helper.create_constant_node(scale_name, scale_value, dtypes.float32, device="cpu") + + uniform_quantize_node = node_def_pb2.NodeDef() + uniform_quantize_node.op = "UniformQuantize" + uniform_quantize_node.name = quantize_node_name + "_UniformQuantize" + uniform_quantize_node.input.extend([quantize_node.input[0], scale_name, zero_point_name]) + Helper.set_attr_int(uniform_quantize_node, "quantization_min_val", self.quantization_min_val) + Helper.set_attr_int(uniform_quantize_node, "quantization_max_val", self.quantization_max_val) + Helper.set_attr_dtype(uniform_quantize_node, "Tin", dtypes.float32) + + # per-channel weights + if isinstance(zero_point_value, list): + # const_weight->q->dq->conv2d + if dequantize_down_node.op == "Conv2D": + Helper.set_attr_int(uniform_quantize_node, "quantization_axis", 3) + elif dequantize_down_node.op == "DepthwiseConv2dNative": + Helper.set_attr_int(uniform_quantize_node, "quantization_axis", 2) + # const_weight->q->dq->matmul + elif dequantize_down_node.op == "MatMul": + if str(dequantize_down_node.attr["transpose_b"]) == "b: true\n": + Helper.set_attr_int(uniform_quantize_node, "quantization_axis", 0) + else: + Helper.set_attr_int(uniform_quantize_node, "quantization_axis", 1) + else: + raise ValueError("Unsupported op type for per-channel quantization.") + # per-tensor weights and activations + else: + Helper.set_attr_int(uniform_quantize_node, "quantization_axis", -1) + + uniform_quantize_node.attr["Tout"].CopyFrom(quantize_node.attr["T"]) + uniform_dequantize_node = node_def_pb2.NodeDef() + uniform_dequantize_node.op = "UniformDequantize" + uniform_dequantize_node.name = dequantize_node_name + "_UniformDequantize" + uniform_dequantize_node.input.extend( + [ + uniform_quantize_node.name, + scale_name, + zero_point_name, + ] + ) + Helper.set_attr_int(uniform_dequantize_node, "quantization_min_val", self.quantization_min_val) + Helper.set_attr_int(uniform_dequantize_node, "quantization_max_val", self.quantization_max_val) + Helper.set_attr_dtype(uniform_dequantize_node, "Tout", dtypes.float32) + + if "quantization_axis" in uniform_quantize_node.attr: + 
uniform_dequantize_node.attr["quantization_axis"].CopyFrom( + uniform_quantize_node.attr["quantization_axis"] + ) + if "Tin" in uniform_quantize_node.attr: + uniform_dequantize_node.attr["Tin"].CopyFrom(uniform_quantize_node.attr["Tout"]) + + parent_node_name = Helper.node_name_from_input(quantize_node.input[0]) + + self.graph_analyzer.add_node(zero_point_node, None, [uniform_quantize_node.name]) + self.graph_analyzer.add_node(scale_node, None, [uniform_quantize_node.name]) + + quantize_output_node_name = set() + for node_name in self.graph_info[quantize_node_name].outputs: + quantize_output_node_name.add(node_name) + self.graph_analyzer.replace_single_node( + uniform_quantize_node, + [parent_node_name], + quantize_node_name, + [i for i in quantize_output_node_name], + quantize_node_name, + ) + + dequantize_output_node_name = set() + for node_name in self.graph_info[dequantize_node_name].outputs: + dequantize_output_node_name.add(node_name) + self.graph_analyzer.replace_single_node( + uniform_dequantize_node, + [uniform_quantize_node.name], + dequantize_node_name, + [i for i in dequantize_output_node_name], + dequantize_node_name, + ) + + self.graph_analyzer.remove_node(quantize_node_name) + self.graph_analyzer.remove_node(dequantize_node_name) + + return self.graph_analyzer.dump_graph() diff --git a/neural_compressor/adaptor/tf_utils/graph_rewriter/qdq/insert_qdq_pattern.py b/neural_compressor/adaptor/tf_utils/graph_rewriter/qdq/insert_qdq_pattern.py index a1d5e4dcd9b..2d1fdbb558b 100644 --- a/neural_compressor/adaptor/tf_utils/graph_rewriter/qdq/insert_qdq_pattern.py +++ b/neural_compressor/adaptor/tf_utils/graph_rewriter/qdq/insert_qdq_pattern.py @@ -62,6 +62,7 @@ def __init__( self.llm_weight_minmax = llm_weight_minmax self.node_details = namedtuple("node_details", ["node", "output"]) self.node_name_mapping = {} + self.min_max_name_value_dict = {} self.check_op_list = { "ConcatV2", "Conv2D", @@ -217,7 +218,7 @@ def do_transformation(self): if each_input == deq_node_name: self.g_qdq.node_name_details[next_node_name].node.input[input_index] = rep_dequantize_node.name - return self.g_qdq.dump_graph() + return self.g_qdq.dump_graph(), self.min_max_name_value_dict def _check_op_list(self, node_type): """Check if the node_type in the allowed op list.""" @@ -596,6 +597,8 @@ def _insert_qdq_pattern_for_weight_node( min_value = np.min(min_max_values[computational_node.name + "__min"]) max_value = np.max(min_max_values[computational_node.name + "__max"]) + self.min_max_name_value_dict[min_name] = min_value + self.min_max_name_value_dict[max_name] = max_value min_node = Helper.create_constant_node(min_name, min_value, dtypes.float32, device="cpu") max_node = Helper.create_constant_node(max_name, max_value, dtypes.float32, device="cpu") if "BatchMatMul" in host_op_type and "BatchMatMul" not in weight_node.op: diff --git a/neural_compressor/adaptor/tf_utils/util.py b/neural_compressor/adaptor/tf_utils/util.py index c2716d55b9c..abe11bcd09d 100644 --- a/neural_compressor/adaptor/tf_utils/util.py +++ b/neural_compressor/adaptor/tf_utils/util.py @@ -46,6 +46,9 @@ "2.14.0202335", "2.14.dev202335", "2.15.0202341", + "2.16.1", + "2.17.0", + "2.18.0", ) diff --git a/neural_compressor/strategy/strategy.py b/neural_compressor/strategy/strategy.py index 384bb7c58af..f44c9b22ad7 100644 --- a/neural_compressor/strategy/strategy.py +++ b/neural_compressor/strategy/strategy.py @@ -379,7 +379,8 @@ def _prepare_tuning(self): # query capability and build tuning space self.capability = self.capability or 
self.adaptor.query_fw_capability(self.model) logger.debug(self.capability) - self.tuning_space = self.tuning_space or self.build_tuning_space(self.config) + # self.tuning_space = self.tuning_space or self.build_tuning_space(self.config) + self.tuning_space = self.build_tuning_space(self.config) self.algo_scheduler = self.algo_scheduler or self._initialize_algo_scheduler() self._eval_baseline() @@ -483,6 +484,10 @@ def traverse(self): self._prepare_tuning() traverse_start_time = time() for op_tuning_cfg in self.next_tune_cfg(): + # op_tuning_cfg[('resnet_model/max_pooling2d/MaxPool', 'pooling')].act_dtype='fp32' + for k in op_tuning_cfg: + if k[1] == "pooling": + op_tuning_cfg[k].act_dtype = "fp32" tuning_start_time = time() self.trials_count += 1 tune_cfg = self._tune_cfg_converter(op_tuning_cfg) diff --git a/neural_compressor/tensorflow/algorithms/static_quant/keras.py b/neural_compressor/tensorflow/algorithms/static_quant/keras.py index a93b225c84e..ea49ae30342 100644 --- a/neural_compressor/tensorflow/algorithms/static_quant/keras.py +++ b/neural_compressor/tensorflow/algorithms/static_quant/keras.py @@ -91,6 +91,39 @@ def __init__(self, framework_specific_info): os.makedirs(DEFAULT_WORKSPACE) self.tmp_dir = (DEFAULT_WORKSPACE + "tmp_model.keras") if self.keras3 else (DEFAULT_WORKSPACE + "tmp_model") + def _check_itex(self): + """Check if the IntelĀ® Extension for TensorFlow has been installed.""" + try: + import intel_extension_for_tensorflow + except: + raise ImportError( + "The IntelĀ® Extension for TensorFlow is not installed. " + "Please install it to run models on ITEX backend" + ) + + def convert_bf16(self): + """Execute the BF16 conversion.""" + tf.keras.mixed_precision.set_global_policy("mixed_bfloat16") + model = self.pre_optimized_model + + for layer in model.layers: + if layer.name in self.bf16_ops: + layer.dtype = "mixed_bfloat16" + + model.save(self.tmp_dir) + converted_model = tf.keras.models.load_model(self.tmp_dir) + tf.keras.mixed_precision.set_global_policy("float32") + + return converted_model + + # (TODO) choose the properly quantize mode + def _check_quantize_mode(self, model): + """Check what quantize mode to use.""" + for layer in model.layers: + if "ReLU" in layer.__class__.__name__: + return "MIN_FIRST" + return "SCALED" + def _set_weights(self, qmodel, layer_weights): """Set fp32 weights to qmodel.""" for qlayer in qmodel.layers: @@ -265,11 +298,18 @@ def fuse_conv_bn(conv_weight, bn_weight, conv_type="Conv2D", eps=1.0e-5): beta = bn_weight[1] mean = bn_weight[2] else: - gamma = 1.0 - beta = bn_weight[0] - mean = bn_weight[1] - if conv_type == "DepthwiseConv2D": - var = bn_weight[2].reshape(1, 1, bn_weight[2].shape[0], 1) + if ( + idx > 0 + and layer.__class__.__name__ == "BatchNormalization" + and fp32_layers[idx - 1].__class__.__name__ == "Conv2D" + ): + conv_name = fp32_layers[idx - 1].name + conv_weight = self.conv_weights[conv_name] + bn_weight = self.bn_weights[layer.name] + conv_type = fp32_layers[idx - 1].__class__.__name__ + + self.layer_weights[conv_name] = fuse_conv_bn(conv_weight, bn_weight, conv_type, layer.epsilon) + self.fold_conv.append(conv_name) else: var = bn_weight[2].reshape(1, 1, 1, bn_weight[2].shape[0]) @@ -432,6 +472,59 @@ def _calibrate(self, model, dataloader=None, calib_interation=None): return quantized_model + @dump_elapsed_time(customized_msg="Model inference") + def evaluate( + self, + model, + dataloader, + postprocess=None, + metrics=None, + measurer=None, + iteration=-1, + fp32_baseline=False, + ): + """The function is used 
to run evaluation on validation dataset. + + Args: + model (object): The model to do calibration. + dataloader (generator): generate the data and labels. + postprocess (object, optional): process the result from the model + metric (object, optional): Depends on model category. Defaults to None. + measurer (object, optional): for precise benchmark measurement. + iteration(int, optional): control steps of mini-batch + fp32_baseline (boolean, optional): only for compare_label=False pipeline + """ + # use keras object + keras_model = model.model + logger.info("Start to evaluate the Keras model.") + results = [] + for idx, (inputs, labels) in enumerate(dataloader): + # use predict on batch + if measurer is not None: + measurer.start() + predictions = keras_model.predict_on_batch(inputs) + measurer.end() + else: + predictions = keras_model.predict_on_batch(inputs) + + if self.fp32_preds_as_label: + self.fp32_results.append(predictions) if fp32_baseline else results.append(predictions) + + if postprocess is not None: + predictions, labels = postprocess((predictions, labels)) + if metrics: + for metric in metrics: + if not hasattr(metric, "compare_label") or ( + hasattr(metric, "compare_label") and metric.compare_label + ): + metric.update(predictions, labels) + if idx + 1 == iteration: + break + + acc = 0 if metrics is None else [metric.result() for metric in metrics] + + return acc if not isinstance(acc, list) or len(acc) > 1 else acc[0] + def query_fw_capability(self, model): """The function is used to return framework tuning capability. @@ -514,7 +607,7 @@ def tuning_cfg_to_fw(self, tuning_cfg): """Parse tune_config and set framework variables. Args: - tuning_cfg (dict): The dict of tuning config. + tuning_cfg (dict): The dict of tunning config. """ self.quantize_config["calib_iteration"] = tuning_cfg["calib_iteration"] self.quantize_config["device"] = self.device @@ -603,6 +696,31 @@ def _get_specified_version_cfg(self, data): return default_config + def get_version(self): + """Get the current backend version information. + + Returns: + [string]: version string. + """ + return self.cur_config["version"]["name"] + + def get_precisions(self): + """Get supported precisions for current backend. + + Returns: + [string list]: the precisions' name. + """ + return self.cur_config["precisions"]["names"] + + def get_op_types(self): + """Get the supported op types by all precisions. + + Returns: + [dictionary list]: A list composed of dictionary which key is precision + and value is the op types. + """ + return self.cur_config["ops"] + def get_quantization_capability(self): """Get the supported op types' quantization capability. @@ -762,6 +880,9 @@ def convert(self, BN_fused_layers=None, conv_names=None, q_layer_dict=None): if self.keras3: layer._inbound_nodes.clear() + if self.keras3: + layer._inbound_nodes.clear() + cur_layer = q_layer_dict[layer.name] if q_layer_dict and layer.name in q_layer_dict else layer x = cur_layer(input_tensors) output_tensor_dict[layer.name] = x @@ -773,3 +894,38 @@ def convert(self, BN_fused_layers=None, conv_names=None, q_layer_dict=None): self.model_outputs.append(x) return tf.keras.models.Model(inputs=self.model.inputs, outputs=self.model_outputs) + + def insert_quant_layers(self, q_layer_dict=None): + """Insert FakeQuant or QDQ layers before the target layers and replace + Keras layers to Quantized layers. + + Args: + q_layer_dict: The dict mapping from layers to be replacement to the quantized layers. 
+ """ + self.input_layer_dict = self._create_input_dict() + output_tensor_dict = {"keras.Input": self.model.input} + + for idx, layer in enumerate(self.model.layers): + if layer.__class__.__name__ == "InputLayer": + output_tensor_dict[layer.name] = output_tensor_dict["keras.Input"] + continue + + input_tensors = ( + output_tensor_dict["keras.Input"] + if idx == 0 + else [output_tensor_dict[input_layer] for input_layer in self.input_layer_dict[layer.name]] + ) + while isinstance(input_tensors, list) and len(input_tensors) == 1: + input_tensors = input_tensors[0] + + if self.keras3: + layer._inbound_nodes.clear() + + cur_layer = q_layer_dict[layer.name] if q_layer_dict and layer.name in q_layer_dict else layer + x = cur_layer(input_tensors) + + output_tensor_dict[layer.name] = x + if layer.name in self.model.output_names: + self.model_outputs.append(x) + + return tf.keras.models.Model(inputs=self.model.inputs, outputs=self.model_outputs) diff --git a/neural_compressor/tensorflow/algorithms/static_quant/tensorflow.py b/neural_compressor/tensorflow/algorithms/static_quant/tensorflow.py index 3bf9cff80af..963e769b576 100644 --- a/neural_compressor/tensorflow/algorithms/static_quant/tensorflow.py +++ b/neural_compressor/tensorflow/algorithms/static_quant/tensorflow.py @@ -161,7 +161,6 @@ def _tuning_cfg_to_fw(self, tuning_cfg): if "activation" in tuning_cfg["op"][each_op_info]: is_asymmetric = tuning_cfg["op"][each_op_info]["activation"]["scheme"] == "asym" self.quantize_config["op_wise_config"][op_name] = (is_perchannel, algorithm, is_asymmetric, weight_bit) - self.fp32_ops = fp32_ops self.bf16_ops = bf16_ops @@ -731,8 +730,326 @@ def get_optype_wise_ability(self): res[op[1]] = {"activation": {"dtype": ["bf16"]}, "weight": {"dtype": ["bf16"]}} return res + def _pre_hook_for_qat(self, dataloader=None): + """Pre hook for QAT.""" + self.model.model = self.qat_convert(self.model.model) + + def _post_hook_for_qat(self): + """Post hook for QAT.""" + pass + + def _pre_eval_hook(self, model): + """Pre evaluation hook.""" + return model + + # Add keyword arguments unpacking + def _post_eval_hook(self, model, **kwargs): + """Post evaluation hook.""" + pass + + def save(self, model, path): + """Save model to the path.""" + pass + + # this function is used to convert keras QAT model to pb in old QAT implementation, + # and it's not used in refactored QAT + def convert(self, model, source, destination): # pragma: no cover + """The function is used to convert a source model format to another. + + Args: + model (neural_compressor.model): base model to be converted. + source (string): The source model format. + destination (string): The destination model format. 
+ """ + assert source.lower() == "qat" and destination.lower() == "default" + capability = self._query_fw_capability(model) + + quantize_config = {"op_wise_config": {}} + for each_op_info in capability["opwise"]: + is_perchannel = False + weight_bit = 7.0 + for op_cap in capability["opwise"][each_op_info]: + if "activation" in op_cap and "quant_mode" in op_cap["activation"]: + activation = op_cap["activation"] + if "weight" in op_cap: + weight = op_cap["weight"] + is_perchannel = True if weight["granularity"][0] == "per_channel" else False + algorithm = activation["algorithm"][0] + is_asymmetric = False + if "activation" in op_cap: + is_asymmetric = True if activation["scheme"][0] == "asym" else False + + quantize_config["op_wise_config"][each_op_info[0]] = ( + is_perchannel, + algorithm, + is_asymmetric, + weight_bit, + ) + from neural_compressor.tensorflow.quantization.utils.graph_converter import GraphConverter + + tmp_graphdef = copy.deepcopy(model.graph_def) + for i in tmp_graphdef.node: + if i.op == "Const" and i.input: + i.ClearField("input") + model.graph_def = tmp_graphdef + converter = GraphConverter( + model, + qt_config=quantize_config, + int8_sequences=self.op_wise_sequences, + fake_quant=True, + new_api=self.new_api, + performance_only=self.performance_only, + use_bf16=self.use_bf16, + ) + + return converter.convert() + + def qat_convert(self, model, quantize_recipe=None): + """Convert a fp32 'tf.keras' model to be a int8 one with quantization aware training implementation. + + Args: + model (tf.keras.Model): The model to be quantized, expected to be a Keras Functional or Sequential model. + quantize_recipe (dict): A dict that decide whether given layers should be quantized. + + Returns: + converted_model (tf.keras.Model): Quantized model with fake quant nodes inserted. + """ + assert isinstance(model, tf.keras.Model), ( + "The model to be converted is expected to be " + "a `tf.keras.Model` instance. You should not pass an instance of type: {input}.".format( + input=model.__class__.__name__ + ) + ) + + assert model.__class__.__name__ in [ + "Functional", + "Sequential", + ], "Only `Functional` or `Sequential` keras model is supported for QAT." + + from neural_compressor.tensorflow.quantization.utils.quantize_graph.qat.quantize_config import global_config + from neural_compressor.tensorflow.quantization.utils.quantize_graph.qat.quantize_helper import ( + init_quantize_config, + qat_clone_function, + ) + + config = init_quantize_config(model, quantize_recipe) + q_model = tf.keras.models.clone_model(model, input_tensors=None, clone_function=qat_clone_function) + global_config.clear() + + return q_model + + @dump_elapsed_time("Pass recover model") + def recover_tuned_model(self, model, q_config): + """Execute the recover process on the specified model. 
+ + Args: + tune_cfg (dict): quantization configuration + model (tf.compat.v1.GraphDef): fp32 model + q_config (dict): recover configuration + + Returns: + tf.compat.v1.GraphDef: the quantized model + """ + from neural_compressor.tensorflow.quantization.utils.graph_rewriter.generic.pre_optimize import PreOptimization + + self.pre_optimizer_handle = PreOptimization(model, self.new_api, self.device) + self.pre_optimized_model = self.pre_optimizer_handle.get_optimized_model(self.itex_mode) + model.graph_def = self.pre_optimized_model.graph_def + + from neural_compressor.tensorflow.quantization.utils.graph_converter_without_calib import ( + GraphConverterWithoutCalib, + ) + + converter = GraphConverterWithoutCalib( + model, + recover_config=q_config, + new_api=self.new_api, + performance_only=self.performance_only, + use_bf16=self.use_bf16, + ) + + return converter.convert_without_calib() + + def diagnosis_helper(self, fp32_model, quan_model, tune_cfg, save_path): + """Tensorflow diagnosis helper function.""" + from neural_compressor.tensorflow.quantization.utils.utility import tf_diagnosis_helper + + return tf_diagnosis_helper(fp32_model, quan_model, tune_cfg, save_path) + + def get_output_op_names(self, qmodel): + """Get the oupur OPs's names.""" + from neural_compressor.tensorflow.quantization.utils.graph_util import GraphAnalyzer + + graph_def = GraphAnalyzer().parse_graph(qmodel.graph_def) + output_op_names = set() + + def _add_output_op_name(opname): + if opname.endswith("_dequantize"): + output_op_names.add(opname[: -len("_dequantize")]) # pylint: disable=no-member + elif opname.endswith("__dequant"): + pass + else: + output_op_names.add(opname) # pylint: disable=no-member + + for output_opname in qmodel.output_node_names: + op_count = 0 + stack = [output_opname] + while stack: + opname = stack.pop() + while True: + op_count += 1 + if opname not in graph_def: + break + op = graph_def[opname] + if op.node.op == "Dequantize": + _add_output_op_name(opname) + break + next_opnames = op.node.input + if not next_opnames: + break + elif len(next_opnames) > 1: + stack += next_opnames[1:] + + opname = next_opnames[0] + + output_op_names = list(output_op_names) + logger.debug(f"output op names: {output_op_names}") + return output_op_names + + def calculate_op_sensitivity( + self, model, dataloader, tune_cfg, output_op_names, confidence_batches, fallback=True, requantize_cfgs=None + ): + """Compute the op sensitivity. + + The sensitivity metric is the mse between the output of the last quantized op of + the quantized model and the output of its corresponding op in the fp32 model. + + 1. Backup the tune cfg + 2. Fallback each int8 op and compute its mse if use fallback (with 'fallback == True'), + or re-quantize each fp32 op(fallen back in the previous stage) and compute its MSE if not. + 3. Sorted op name list according to its MSE + + Args: + fp32_model: The fp32 model. + dataloader: the dataloader with full dataset. + tune_cfg: tuning config + fallback: denote fallback stage or re-quantize stage + requantize_cfgs: the dict of tuning configs for all re-quantizable ops + + Returns: + A list of op names, sorted by its MSE sensitivity. 
+ """ + fp32_op_cfg = {"activation": {"dtype": "fp32", "quant_mode": "fp32"}, "weight": {"dtype": "fp32"}} + + if fallback: + ops_list = [ + op + for op, config in tune_cfg["op"].items() + if config["activation"]["quant_mode"] in ("static", "dynamic") + ] + replace_cfgs = {op: fp32_op_cfg for op in tune_cfg["op"]} + else: + ops_list = [ + op + for op, config in tune_cfg["op"].items() + if config["activation"]["quant_mode"] == "fp32" and op in requantize_cfgs + ] + replace_cfgs = requantize_cfgs + + # Step2. compute mse + mse_result = self._get_mse_order( + model, deepcopy(tune_cfg), replace_cfgs, ops_list, dataloader, output_op_names, confidence_batches + ) + + # Step3. sort + mse_order = [op for op, _ in sorted(mse_result.items(), key=lambda i: i[1])] + logger.debug("Dump MSE order:") + for op in mse_order: + logger.debug(f"{op}: {mse_result[op]}") + return mse_order + + def _get_mse_order( + self, fp32_model, tune_cfg, replace_cfgs, ops_lst, dataloader, output_op_names, confidence_batches + ): + """Compute MSE.""" + op_cfg = tune_cfg["op"] + mse_result = {} + partial_dataloader = self._partial_dataloader(dataloader, confidence_batches) + + fp32_output = self._inference_model_on_batches(fp32_model, tune_cfg, partial_dataloader, output_op_names) + + for op in ops_lst: + # backup and set replace tuning config + backup_cfg = op_cfg[op] + op_cfg[op] = replace_cfgs[op] + + # quantize and inference the model + q_model = self.quantize(tune_cfg, fp32_model, partial_dataloader) + q_output = self._inference_model_on_batches(q_model, tune_cfg, partial_dataloader, output_op_names) + + mse_result[op] = self._calculate_mse(fp32_output, q_output) + + # recover tune_cfg + op_cfg[op] = backup_cfg + + return mse_result + + def _partial_dataset_of(self, dataloader, confidence_batches): + """Partial dataset.""" + from neural_compressor.tensorflow.utils import DummyDataset, DummyDatasetV2 + + if isinstance(dataloader.dataset, DummyDataset) or isinstance(dataloader.dataset, DummyDatasetV2): + assert isinstance(confidence_batches, int) + ds = copy.deepcopy(dataloader.dataset) + ds.dataset = ds.dataset[:confidence_batches] + return ds + else: + return dataloader.dataset.take(confidence_batches) + + def _partial_dataloader(self, dataloader, confidence_batches): + """Partial dataloader.""" + return type(dataloader)( + dataset=self._partial_dataset_of(dataloader, confidence_batches), + batch_size=dataloader.batch_size, + last_batch=dataloader.last_batch, + collate_fn=dataloader.collate_fn, + sampler=dataloader.sampler, + batch_sampler=dataloader.batch_sampler, + num_workers=dataloader.num_workers, + pin_memory=dataloader.pin_memory, + shuffle=dataloader.shuffle, + distributed=dataloader.distributed, + ) + + def _calculate_mse(self, fp32_output, q_output): + """MSE calculation.""" + result = [] + for i, j in zip(fp32_output, q_output): + result.append(np.square(i - j).mean()) + return np.array(result).mean() + + def _inference_model_on_batches(self, model, tune_cfg, dataloader, output_op_names): + """Inference model on batches.""" + from neural_compressor.tensorflow.quantization.utils.utility import generate_feed_dict + + input_tensors = model.input_tensor + output_tensors = [] + for op in output_op_names: + for tensor in model.graph.get_operation_by_name(op).outputs: + output_tensors.append(tensor) + + predictions = [] + for index, (inputs, _) in enumerate(dataloader): + feed_dict = generate_feed_dict(input_tensors, inputs) + + pred = model.sess.run(output_tensors, feed_dict) + for item in pred: + 
predictions.append(item) + + return predictions + -class Tensorflow_ITEXAdaptor(TensorFlowAdaptor): # pragma: no cover +class Tensorflow_ITEXAdaptor(TensorFlowAdaptor): """Tensorflow ITEX Adaptor Class.""" def __init__(self, framework_specific_info): diff --git a/neural_compressor/tensorflow/algorithms/static_quant/tensorflow.yaml b/neural_compressor/tensorflow/algorithms/static_quant/tensorflow.yaml index acd8857eda0..037e9d01134 100644 --- a/neural_compressor/tensorflow/algorithms/static_quant/tensorflow.yaml +++ b/neural_compressor/tensorflow/algorithms/static_quant/tensorflow.yaml @@ -16,7 +16,7 @@ --- - version: - name: ['2.11.0202242', '2.11.0202250', '2.11.0202317', '2.11.0202323', '2.14.0202335', '2.14.dev202335', '2.15.0202341'] + name: ['2.11.0202242', '2.11.0202250', '2.11.0202317', '2.11.0202323', '2.14.0202335', '2.14.dev202335', '2.15.0202341', '2.16.1', '2.17.0', '2.18.0'] bf16: ["_MklLayerNorm", "Conv2D", "Conv2DBackpropFilter", "Conv2DBackpropInput", "Conv3D", "Conv3DBackpropFilterV2", "Conv3DBackpropInputV2", "DepthwiseConv2dNative", "DepthwiseConv2dNativeBackpropFilter", "DepthwiseConv2dNativeBackpropInput", "GRUBlockCell", @@ -150,7 +150,7 @@ - version: - name: ['2.1.0', '2.2.0', '2.3.0', '2.4.0', '2.5.0', '2.6.0', '2.6.1', '2.6.2', '2.7.0', '2.8.0', '2.9.0', '2.9.1', '2.10.0', '2.11.0', '2.12.0', '2.13.0', '2.14.0', '2.14.1', '2.15.0', '2.15.1', '1.15.0-up1', '1.15.0-up2', '1.15.0-up3'] + name: ['2.1.0', '2.2.0', '2.3.0', '2.4.0', '2.5.0', '2.6.0', '2.6.1', '2.6.2', '2.7.0', '2.8.0', '2.9.0', '2.9.1', '2.10.0', '2.11.0', '2.12.0', '2.13.0', '1.15.0-up1', '1.15.0-up2', '1.15.0-up3'] bf16: ['Conv2D', 'Conv3D', 'MatMul', 'BatchMatMul', 'MaxPool', 'MaxPool3D', 'AvgPool', 'AvgPool3D', 'DepthwiseConv2dNative'] fp32: ['*'] # '*' means all op types diff --git a/neural_compressor/tensorflow/keras/layers/__init__.py b/neural_compressor/tensorflow/keras/layers/__init__.py index ee4249138b3..0e966814f6e 100644 --- a/neural_compressor/tensorflow/keras/layers/__init__.py +++ b/neural_compressor/tensorflow/keras/layers/__init__.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # -# Copyright (c) 2024 Intel Corporation +# Copyright (c) 2022 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
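For reference, the op-sensitivity pass above (calculate_op_sensitivity / _get_mse_order / _calculate_mse) ranks ops by the mean squared error between the fp32 model's outputs and the outputs of a model whose tuning config was changed for a single op. A minimal standalone sketch of that ranking logic, with hypothetical fp32_outputs and quantize_and_infer helpers standing in for the adaptor's inference calls:

import numpy as np

def batch_mse(fp32_batches, q_batches):
    # Average the per-batch squared error, as _calculate_mse does.
    return np.array([np.square(f - q).mean() for f, q in zip(fp32_batches, q_batches)]).mean()

def rank_ops_by_sensitivity(ops, fp32_outputs, quantize_and_infer):
    # quantize_and_infer(op) is assumed to return the outputs of a model re-quantized
    # with only `op`'s tuning config swapped (fallback or re-quantize stage).
    mse_per_op = {op: batch_mse(fp32_outputs, quantize_and_infer(op)) for op in ops}
    # Lowest MSE first, mirroring the sort in calculate_op_sensitivity.
    return sorted(mse_per_op, key=mse_per_op.get)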
diff --git a/neural_compressor/tensorflow/keras/quantization/config.py b/neural_compressor/tensorflow/keras/quantization/config.py index 1cebc23b0e2..5d050e0c043 100644 --- a/neural_compressor/tensorflow/keras/quantization/config.py +++ b/neural_compressor/tensorflow/keras/quantization/config.py @@ -120,13 +120,13 @@ def get_model_info(model) -> List[Tuple[str, Callable]]: """Get concrete node names for supported operators.""" white_list = [ "Dense", - "Conv2D", - "DepthwiseConv2D", - "SeparableConv2D", - "AvgPool2D", - "AveragePooling2D", - "MaxPool2D", - "MaxPooling2D", + # "Conv2d", + # "DepthwiseConv2D", + # "SeparableConv2D", + # "AvgPool2D", + # "AveragePooling2D", + # "MaxPool2D", + # "MaxPooling2D", ] filter_result = [] diff --git a/neural_compressor/tensorflow/quantization/algorithm_entry.py b/neural_compressor/tensorflow/quantization/algorithm_entry.py index 5a71e197753..3f7f244e22d 100644 --- a/neural_compressor/tensorflow/quantization/algorithm_entry.py +++ b/neural_compressor/tensorflow/quantization/algorithm_entry.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024 Intel Corporation +# Copyright (c) 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -46,7 +46,7 @@ def static_quant_entry( if isinstance(model, KerasModel): assert valid_keras_format(model.model), "Only Sequential or Functional models are supported now." framework = KerasAdaptor - elif TFConfig.global_config["backend"] == "itex": + elif framework_specific_info["backend"] == "itex": framework = Tensorflow_ITEXAdaptor else: framework = TensorFlowAdaptor diff --git a/neural_compressor/tensorflow/quantization/config.py b/neural_compressor/tensorflow/quantization/config.py index 10db0249f35..6f100cbd003 100644 --- a/neural_compressor/tensorflow/quantization/config.py +++ b/neural_compressor/tensorflow/quantization/config.py @@ -111,11 +111,11 @@ def register_supported_configs(cls) -> List[OperatorConfig]: """Register supported config.""" supported_configs = [] static_quant_config = StaticQuantConfig( - weight_dtype=["int8", "bf16", "fp32"], + weight_dtype=["int8", "fp32"], weight_sym=[True, False], weight_granularity=["per_tensor", "per_channel"], weight_algorithm=["minmax", "kl"], - act_dtype=["int8", "bf16", "fp32"], + act_dtype=["int8", "fp32"], act_sym=[True, False], act_granularity=["per_tensor"], act_algorithm=["minmax", "kl"], @@ -141,11 +141,10 @@ def register_supported_configs(cls) -> List[OperatorConfig]: def get_model_info(self, model) -> List[Tuple[str, Callable]]: """Get concrete node names for supported operators.""" white_list = [ + "MatMul", "Conv2D", - "FusedBatchNormV3", "Conv3D", "_MklFusedInstanceNorm", - "MatMul", "BatchMatMul", "BatchMatMulV2", "DepthwiseConv2dNative", @@ -155,8 +154,9 @@ def get_model_info(self, model) -> List[Tuple[str, Callable]]: "MaxPool", "MaxPool3D", "AvgPool", + "_MklFusedInstanceNorm", "Conv2DBackpropInput", - "Conv3DBackpropInputV2", + "Conv2DBackpropInputV2", ] for key in self._local_config.keys(): if key in white_list: diff --git a/neural_compressor/tensorflow/quantization/utils/graph_converter.py b/neural_compressor/tensorflow/quantization/utils/graph_converter.py index e3c1c640c86..1c4db65f562 100644 --- a/neural_compressor/tensorflow/quantization/utils/graph_converter.py +++ b/neural_compressor/tensorflow/quantization/utils/graph_converter.py @@ -243,6 +243,7 @@ def _inference(self, model): # INC needs turn off ITEX optimization pass in calibration stage. 
# TODO ITEX will provide API to replace setting environment variable. os.environ["ITEX_REMAPPER"] = "0" + sess = model.sess iter_op = model.iter_op input_tensor = model.input_tensor @@ -330,6 +331,9 @@ def check_shape(tensor, data): os.environ["ITEX_REMAPPER"] = "1" def _inference_llm(self, model): + logger.info("Start sampling on calibration dataset.") + f = tf.io.gfile.GFile("calib_qdq.pb", "wb") + f.write(model.graph_def.SerializeToString()) input_tensor_names = model.input_tensor_names auto_trackable = model.model infer = auto_trackable.signatures["serving_default"] @@ -342,7 +346,7 @@ def _inference_llm(self, model): for i, input_tensor_name in enumerate(input_tensor_names): feed_dict[input_tensor_name] = inputs[i] - _ = infer(**feed_dict) + pred = infer(**feed_dict) if idx >= self.calib_iteration: break @@ -904,6 +908,7 @@ def _insert_qdq_pairs(self): # TODO: this is a workaround to make Min/Max node be completely eliminated in int8 graph # after enabling pad+conv2d in new API. non_pad_ops = list(list(set(self.fp32_ops).union(set(self.bf16_ops)))) + sampling_graph_def = FusePadWithFP32Conv2DOptimizer( sampling_graph_def, non_pad_ops, self._tmp_model.input_node_names, self.op_wise_config, self.new_api, True ).do_transformation() @@ -918,6 +923,7 @@ def _insert_qdq_pairs(self): sampling_graph_def.library.CopyFrom(self.model.graph_def.library) self._sampling_model.graph_def = sampling_graph_def self._sampling_model.output_tensor_names = output_tensor_names + tmp_dump_file = tempfile.mkstemp(suffix=".log")[1] with CaptureOutputToFile(tmp_dump_file): self._inference(self._sampling_model) @@ -975,8 +981,12 @@ def _convert_qdq(self): ).do_transformation() self._tmp_graph_def = ShareQDQForItexYPatternOptimizer(self._tmp_graph_def).do_transformation() - self._tmp_graph_def = MergeDuplicatedQDQOptimizer(self._tmp_graph_def).do_transformation() + # self._tmp_graph_def = MergeDuplicatedQDQOptimizer(self._tmp_graph_def).do_transformation() + from neural_compressor.tensorflow.quantization.utils.graph_rewriter.int8.convert_qdq_to_uniform_qdq import ( + ConvertUniformQDQOptimizer, + ) + self._tmp_graph_def = ConvertUniformQDQOptimizer(self._tmp_graph_def).do_transformation() self._tmp_graph_def.library.CopyFrom(self.model.graph_def.library) self._tmp_model.graph_def = self._tmp_graph_def self._tmp_model.graph_def.library.CopyFrom(self.model.graph_def.library) diff --git a/neural_compressor/tensorflow/quantization/utils/graph_rewriter/generic/fuse_pad_with_conv.py b/neural_compressor/tensorflow/quantization/utils/graph_rewriter/generic/fuse_pad_with_conv.py index e894ee8d3cb..abedcab5445 100644 --- a/neural_compressor/tensorflow/quantization/utils/graph_rewriter/generic/fuse_pad_with_conv.py +++ b/neural_compressor/tensorflow/quantization/utils/graph_rewriter/generic/fuse_pad_with_conv.py @@ -49,6 +49,7 @@ def do_transformation(self): ) padding_tensor_dict = {} + for node_combination in target_nodes: conv_name = node_combination[1] @@ -59,16 +60,16 @@ def do_transformation(self): is_perchannel = self.cfg[conv_name][0] - # Line 55 to line 65 should be removed once the TFDO enabling the single quantized - # conv2D supporting. - if len(pattern) == 2: - # TODO we need to enable single quantizedconv2d with s8 input. - if not is_perchannel and not cur_graph.has_positive_input(conv_name): - continue - # TFDO has the limitation that the single QuantizedConv2DPerchannel doesn't - # support padding_list filed. 
- if is_perchannel: - continue + # # Line 55 to line 65 should be removed once the TFDO enabling the single quantized + # # conv2D supporting. + # if len(pattern) == 2: + # # TODO we need to enable single quantizedconv2d with s8 input. + # if not is_perchannel and not cur_graph.has_positive_input(conv_name): + # continue + # # TFDO has the limitation that the single QuantizedConv2DPerchannel doesn't + # # support padding_list filed. + # if is_perchannel: + # continue if conv_name in self.excluded_conv: continue diff --git a/neural_compressor/tensorflow/quantization/utils/graph_rewriter/generic/fuse_pad_with_fp32_conv.py b/neural_compressor/tensorflow/quantization/utils/graph_rewriter/generic/fuse_pad_with_fp32_conv.py index c3cf5a4b62c..97a823d72df 100644 --- a/neural_compressor/tensorflow/quantization/utils/graph_rewriter/generic/fuse_pad_with_fp32_conv.py +++ b/neural_compressor/tensorflow/quantization/utils/graph_rewriter/generic/fuse_pad_with_fp32_conv.py @@ -59,16 +59,16 @@ def do_transformation(self): is_perchannel = self.cfg[conv_name][0] - # Line 55 to line 65 should be removed once the TFDO enabling the single quantized - # conv2D supporting. - if len(pattern) == 2: - # TODO we need to enable single quantizedconv2d with s8 input. - if not is_perchannel and not cur_graph.has_positive_input(conv_name): - continue - # TFDO has the limitation that the single QuantizedConv2DPerchannel doesn't - # support padding_list filed. - if is_perchannel: - continue + # # Line 55 to line 65 should be removed once the TFDO enabling the single quantized + # # conv2D supporting. + # if len(pattern) == 2: + # # TODO we need to enable single quantizedconv2d with s8 input. + # if not is_perchannel and not cur_graph.has_positive_input(conv_name): + # continue + # # TFDO has the limitation that the single QuantizedConv2DPerchannel doesn't + # # support padding_list filed. + # if is_perchannel: + # continue if conv_name in self.excluded_conv: continue diff --git a/neural_compressor/tensorflow/quantization/utils/graph_rewriter/generic/pre_optimize.py b/neural_compressor/tensorflow/quantization/utils/graph_rewriter/generic/pre_optimize.py index beac92c1b8d..d30079364e7 100644 --- a/neural_compressor/tensorflow/quantization/utils/graph_rewriter/generic/pre_optimize.py +++ b/neural_compressor/tensorflow/quantization/utils/graph_rewriter/generic/pre_optimize.py @@ -185,10 +185,10 @@ def get_optimized_model(self, itex_mode=False): # Put FuseDecomposedBNOptimizer before GraphFoldConstantOptimizer # The 'Sub' op in the small decomposed ops of BN will be converted to const by GraphFoldConstantOptimizer. # Then the FuseDecomposedBNOptimizer can't fuse the small decomposed ops to BN. 
-        if self.new_api:
-            self._tmp_graph_def = FuseDecomposedBNOptimizer(self._tmp_graph_def).do_transformation()
-            self._tmp_graph_def = FuseDecomposedINOptimizer(self._tmp_graph_def).do_transformation()
-            self._tmp_graph_def = FuseLayerNormOptimizer(self._tmp_graph_def).do_transformation()
+        # if self.new_api:
+        #     self._tmp_graph_def = FuseDecomposedBNOptimizer(self._tmp_graph_def).do_transformation()
+        #     self._tmp_graph_def = FuseDecomposedINOptimizer(self._tmp_graph_def).do_transformation()
+        #     self._tmp_graph_def = FuseLayerNormOptimizer(self._tmp_graph_def).do_transformation()

         self._tmp_graph_def = GraphFoldConstantOptimizer(self._tmp_graph_def).do_transformation()

diff --git a/neural_compressor/tensorflow/quantization/utils/graph_rewriter/int8/convert_qdq_to_uniform_qdq.py b/neural_compressor/tensorflow/quantization/utils/graph_rewriter/int8/convert_qdq_to_uniform_qdq.py
new file mode 100644
index 00000000000..239c6dbd029
--- /dev/null
+++ b/neural_compressor/tensorflow/quantization/utils/graph_rewriter/int8/convert_qdq_to_uniform_qdq.py
@@ -0,0 +1,167 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2021 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Convert QuantizeV2/Dequantize pairs to UniformQuantize/UniformDequantize Graph Rewriter."""
+
+import numpy as np
+import tensorflow as tf
+from tensorflow.core.framework import attr_value_pb2, node_def_pb2
+from tensorflow.python.framework import dtypes, tensor_util
+
+from neural_compressor.tensorflow.quantization.utils.graph_util import GraphAnalyzer
+from neural_compressor.tensorflow.quantization.utils.graph_util import GraphRewriterHelper as Helper
+from neural_compressor.tensorflow.utils import version1_gt_version2, version1_lt_version2
+
+from ..graph_base import GraphRewriterBase
+
+
+class ConvertUniformQDQOptimizer(GraphRewriterBase):
+    """Convert QuantizeV2/Dequantize pairs to UniformQuantize/UniformDequantize pairs."""
+
+    def __init__(self, model, device="cpu"):
+        """Initialization."""
+        super().__init__(model)
+        self.device = device
+        self.graph_analyzer = GraphAnalyzer()
+        self.graph_analyzer.graph = self.model
+        self.eps = 1e-05
+        self.graph_info = self.graph_analyzer.parse_graph()
+
+        self.uint8_type = dtypes.quint8.as_datatype_enum
+        self.int8_type = dtypes.qint8.as_datatype_enum
+        self.float32_type = dtypes.float32.as_datatype_enum
+        self.qint32_type = dtypes.qint32.as_datatype_enum
+
+        self.quantization_min_val = None
+        self.quantization_max_val = None
+
+    def _calculate_zp_and_scale(self, min_value, max_value, dtype):
+        if dtype == attr_value_pb2.AttrValue(type=self.int8_type):
+            zp = 0
+            scale_range = 127
+            self.quantization_min_val = -128
+            self.quantization_max_val = 127
+        elif dtype == attr_value_pb2.AttrValue(type=self.uint8_type):
+            zp = 128
+            scale_range = 255
+            self.quantization_min_val = 0
+            self.quantization_max_val = 255
+        else:
+            raise ValueError("Unexpected data type for Quantize Op.")
+
+        if isinstance(max_value, float):
+            return zp, max(abs(max_value), abs(min_value)) / scale_range
+
+        scales = []
+        zero_points = []
+        for i in range(len(max_value)):
+            scales.append(max(abs(max_value[i]), abs(min_value[i])) / scale_range)
+            zero_points.append(zp)
+
+        return zero_points, scales
+
+    def do_transformation(self):
+        """Replace each QuantizeV2/Dequantize pair with a UniformQuantize/UniformDequantize pair.
+
+        Returns:
+            [graphdef]: the optimized graphdef object
+        """
+        target_nodes = self.graph_analyzer.query_fusion_pattern_nodes([["QuantizeV2"], ["Dequantize"]])
+        for i in target_nodes:
+            shared_quantize_node = False
+            quantize_node_name = i[0]
+            dequantize_node_name = i[1]
+            dequantize_node = self.graph_info[dequantize_node_name].node
+            # if quantize_node_name in self.graph_info:
+            quantize_node = self.graph_info[quantize_node_name].node
+            quantize_min_name = quantize_node.input[1]
+            quantize_max_name = quantize_node.input[2]
+
+            dtype = quantize_node.attr["T"]
+
+            min_value = self.graph_info[quantize_min_name].node.attr["value"].tensor.float_val[0]
+            max_value = self.graph_info[quantize_max_name].node.attr["value"].tensor.float_val[0]
+
+            zero_point_value, scale_value = self._calculate_zp_and_scale(min_value, max_value, dtype)
+            zero_point_name = quantize_min_name[:-4] + "zero_point"
+            scale_name = quantize_min_name[:-4] + "scale"
+
+            zero_point_node = Helper.create_constant_node(zero_point_name, zero_point_value, dtypes.int32, device="cpu")
+            scale_node = Helper.create_constant_node(scale_name, scale_value, dtypes.float32, device="cpu")
+
+            uniform_quantize_node = node_def_pb2.NodeDef()
+            uniform_quantize_node.op = "UniformQuantize"
+            uniform_quantize_node.name = quantize_node_name + "_UniformQuantize"
+            uniform_quantize_node.input.extend([quantize_node.input[0], scale_name, zero_point_name])
+            Helper.set_attr_int(uniform_quantize_node, "quantization_min_val", self.quantization_min_val)
+            Helper.set_attr_int(uniform_quantize_node, "quantization_max_val", self.quantization_max_val)
+            Helper.set_attr_dtype(uniform_quantize_node, "Tin", dtypes.float32)
+
+            if "axis" in quantize_node.attr:
+                uniform_quantize_node.attr["quantization_axis"].CopyFrom(quantize_node.attr["axis"])
+            uniform_quantize_node.attr["Tout"].CopyFrom(quantize_node.attr["T"])
+
+            uniform_dequantize_node = node_def_pb2.NodeDef()
+            uniform_dequantize_node.op = "UniformDequantize"
+            uniform_dequantize_node.name = dequantize_node_name + "_UniformDequantize"
+
+            uniform_dequantize_node.input.extend(
+                [
+                    uniform_quantize_node.name,
+                    scale_name,
+                    zero_point_name,
+                ]
+            )
+            Helper.set_attr_int(uniform_dequantize_node, "quantization_min_val", self.quantization_min_val)
+            Helper.set_attr_int(uniform_dequantize_node, "quantization_max_val", self.quantization_max_val)
+            Helper.set_attr_dtype(uniform_dequantize_node, "Tout", dtypes.float32)
+
+            if "quantization_axis" in quantize_node.attr:
+                uniform_dequantize_node.attr["quantization_axis"].CopyFrom(quantize_node.attr["quantization_axis"])
+            if "Tin" in uniform_quantize_node.attr:
+                uniform_dequantize_node.attr["Tin"].CopyFrom(uniform_quantize_node.attr["Tout"])
+            # if not shared_quantize_node:
+            parent_node_name = Helper.node_name_from_input(quantize_node.input[0])
+
+            self.graph_analyzer.add_node(zero_point_node, None, [uniform_quantize_node.name])
+            self.graph_analyzer.add_node(scale_node, None, [uniform_quantize_node.name])
+
+            quantize_output_node_name = set()
+            for node_name in self.graph_info[quantize_node_name].outputs:
+                quantize_output_node_name.add(node_name)
+            self.graph_analyzer.replace_single_node(
+                uniform_quantize_node,
+                [parent_node_name],
+                quantize_node_name,
+                [i for i in
quantize_output_node_name], + quantize_node_name, + ) + + dequantize_output_node_name = set() + for node_name in self.graph_info[dequantize_node_name].outputs: + dequantize_output_node_name.add(node_name) + self.graph_analyzer.replace_single_node( + uniform_dequantize_node, + [uniform_quantize_node.name], + dequantize_node_name, + [i for i in dequantize_output_node_name], + dequantize_node_name, + ) + + self.graph_analyzer.remove_node(quantize_node_name) + self.graph_analyzer.remove_node(dequantize_node_name) + + return self.graph_analyzer.dump_graph() diff --git a/neural_compressor/tensorflow/quantization/utils/graph_rewriter/int8/fuse_matmul_requantize.py b/neural_compressor/tensorflow/quantization/utils/graph_rewriter/int8/fuse_matmul_requantize.py index 2fef260b500..27b3b998024 100644 --- a/neural_compressor/tensorflow/quantization/utils/graph_rewriter/int8/fuse_matmul_requantize.py +++ b/neural_compressor/tensorflow/quantization/utils/graph_rewriter/int8/fuse_matmul_requantize.py @@ -263,14 +263,10 @@ def do_transformation(self): weight_node = self.graph_info[new_node.input[1]].node bias_node = self.graph_info[new_node.input[2]].node + max_input_node = self.graph_info[last_node.input[-1]].node + min_input_node = self.graph_info[last_node.input[-2]].node - max_input_node = None - min_input_node = None - if last_node.op.find("Requantize") != -1 or last_node.op.find("QuantizeV2") != -1: - max_input_node = self.graph_info[last_node.input[-1]].node - min_input_node = self.graph_info[last_node.input[-2]].node - - if max_input_node and max_input_node.op == "Enter": # pragma: no cover + if max_input_node.op == "Enter": # pragma: no cover min_input_parent_name = Helper.node_name_from_input(min_input_node.input[0]) max_input_parent_name = Helper.node_name_from_input(max_input_node.input[0]) min_input_parent_node = self.graph_info[min_input_parent_name].node diff --git a/neural_compressor/tensorflow/quantization/utils/graph_rewriter/qdq/insert_qdq_pattern.py b/neural_compressor/tensorflow/quantization/utils/graph_rewriter/qdq/insert_qdq_pattern.py index 64fc4a69dac..949246fead8 100644 --- a/neural_compressor/tensorflow/quantization/utils/graph_rewriter/qdq/insert_qdq_pattern.py +++ b/neural_compressor/tensorflow/quantization/utils/graph_rewriter/qdq/insert_qdq_pattern.py @@ -551,6 +551,8 @@ def _insert_qdq_pattern_for_weight_node( min_value = -range_value max_value = range_value elif weight_node.op == "ReadVariableOp": + if not self.llm_weight_minmax: + return min_value = self.llm_weight_minmax[weight_node.name][0] max_value = self.llm_weight_minmax[weight_node.name][1] min_value *= range_coefficent diff --git a/neural_compressor/tensorflow/quantization/utils/utility.py b/neural_compressor/tensorflow/quantization/utils/utility.py index 5e3fa83ea90..ea900abd05d 100644 --- a/neural_compressor/tensorflow/quantization/utils/utility.py +++ b/neural_compressor/tensorflow/quantization/utils/utility.py @@ -37,6 +37,21 @@ from neural_compressor.tensorflow.quantization.utils.graph_util import GraphAnalyzer, GraphRewriterHelper +def disable_random(seed=1): + """A Decorator to disable tf random seed.""" + + def decorator(func): + def wrapper(*args, **kw): + tf.compat.v1.disable_eager_execution() + tf.compat.v1.reset_default_graph() + tf.compat.v1.set_random_seed(seed) + return func(*args, **kw) + + return wrapper + + return decorator + + def read_graph(in_graph, in_graph_is_binary=True): """Reads input graph file as GraphDef. 
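The disable_random decorator added to utility.py above disables eager execution, resets the default graph, and pins the TF1-style random seed before the wrapped function runs. A hedged usage sketch (build_reproducible_graph is a hypothetical test helper, not part of the library):

import tensorflow as tf

from neural_compressor.tensorflow.quantization.utils.utility import disable_random

@disable_random(seed=1)
def build_reproducible_graph():
    # Executed with eager mode off, a fresh default graph, and a fixed seed,
    # so the ops created here are reproducible across calls.
    tf.random.normal([2, 3], name="rand_input")
    return tf.compat.v1.get_default_graph()

graph = build_reproducible_graph()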
@@ -401,11 +416,87 @@ def get_model_input_shape(model): _shape = [item.value for item in _shape] if len(_shape) > 1 and isinstance(_shape[0], int): return _shape[0] - elif isinstance(_shape, list) and hasattr(_shape[0], "value"): - return _shape[0].value return 1 +def get_tensor_val_from_graph_node(graph_node_name_mapping, node_name): + """Get the tensor value for given node name. + + Args: + graph_node_name_mapping: key: node name, val: node + node_name: query node + + Returns: + tensor_val: numpy array + """ + from tensorflow.python.framework import tensor_util + + node = graph_node_name_mapping[node_name] + node_tensor = node.attr["value"].tensor + tensor_val = tensor_util.MakeNdarray(node_tensor) + return tensor_val + + +def int8_node_name_reverse(node): + """Reverse int8 node name.""" + int8_postfix = "_eightbit" + node_name = node.name + if "Quantized" in node.op: + index_postfix = node_name.find(int8_postfix) + if index_postfix != -1: + node_name = node_name[:index_postfix] + return node_name + + +def tf_diagnosis_helper(fp32_model, quan_model, tune_cfg, save_path): + """Tensorflow diagnosis helper function.""" + from ...utils.utility import dump_data_to_local + + fp32_node_mapping = {} + qnode_mapping = {} + for node in fp32_model.graph_def.node: + fp32_node_mapping[node.name] = node + for node in quan_model.graph_def.node: + qnode_mapping[node.name] = node + supported_op_lst = set(["Conv2D", "MatMul", "ConcatV2", "MaxPool", "AvgPool", "DepthwiseConv2dNative"]) + fp32_node_lst = set() + for node in fp32_model.graph_def.node: + if node.op in supported_op_lst: + fp32_node_lst.add(node.name) + int8_node_lst = set() + bf16_node_lst = set() + for node in quan_model.graph_def.node: + node_name = node.name + node_name = int8_node_name_reverse(node) + if "Quantized" in node.op: + int8_node_lst.add(node_name) + elif node.attr["value"].tensor.dtype == tf.dtypes.bfloat16.as_datatype_enum: # pragma: no cover + bf16_node_lst.add(node.name) + else: + continue + inspect_node_lst = fp32_node_lst.intersection(bf16_node_lst.union(int8_node_lst)) + activation_min_max, updated_cfg = _parse_config(quan_model.q_config, tune_cfg, inspect_node_lst) + dump_data_to_local(activation_min_max, save_path, "activation_min_max.pkl") + dump_data_to_local(updated_cfg, save_path, "cfg.pkl") + + return inspect_node_lst, updated_cfg + + +def _parse_config(q_config, cfg, op_list): + """Parse q_config and get dequantize min max value.""" + activation_min_max = {} + if "__requant_min_max" in q_config: + for node_name, val in q_config["__requant_min_max"].items(): + node_name = node_name.split("_eightbit_requant_range")[0] + if node_name in op_list: + activation_min_max[node_name] = {"min": val[0], "max": val[1]} + updated_cfg = {"op": {}} + for op_name_and_type in cfg["op"].keys(): + if op_name_and_type[0] in op_list: + updated_cfg["op"][op_name_and_type] = cfg["op"][op_name_and_type] + return activation_min_max, updated_cfg + + def generate_feed_dict(input_tensor, inputs): """Generate feed dict helper function.""" if len(input_tensor) == 1: diff --git a/neural_compressor/tensorflow/utils/constants.py b/neural_compressor/tensorflow/utils/constants.py index fc79b116e17..7dc9dd54c8d 100644 --- a/neural_compressor/tensorflow/utils/constants.py +++ b/neural_compressor/tensorflow/utils/constants.py @@ -21,6 +21,9 @@ "2.14.0202335", "2.14.dev202335", "2.15.0202341", + "2.16.1", + "2.17.0", + "2.18.0", ) TENSORFLOW_DEFAULT_CONFIG = { @@ -45,7 +48,6 @@ "Conv3D": "conv3d", "DepthwiseConv2dNative": "conv2d", "FusedBatchNormV3": 
"batchnorm", - "FusedBatchNorm": "batchnorm", "_MklFusedInstanceNorm": "instancenorm", "MaxPool": "pooling", "MaxPool3D": "pooling", diff --git a/neural_compressor/tensorflow/utils/model_wrappers.py b/neural_compressor/tensorflow/utils/model_wrappers.py index 5740bd882fc..6747805a79b 100644 --- a/neural_compressor/tensorflow/utils/model_wrappers.py +++ b/neural_compressor/tensorflow/utils/model_wrappers.py @@ -68,7 +68,7 @@ def get_model_type(model): if isinstance(model, str): model = os.path.abspath(os.path.expanduser(model)) if ( - ((model.endswith(".h5") or model.endswith(".keras")) and os.path.isfile(model)) + (model.endswith(".h5") and os.path.isfile(model)) or is_saved_model_format(os.path.dirname(model)) or (os.path.isdir(model) and is_saved_model_format(model)) ): @@ -354,20 +354,10 @@ def _get_graph_from_saved_model_v3(model, input_tensor_names, output_tensor_name def _get_graph_from_saved_model_v2(saved_model_dir, input_tensor_names, output_tensor_names): - """The version 2 function that get graph from the original keras model. - - Args: - saved_model_dir (string): model path of a temporary saved_model. - input_tensor_names (list of string): input tensor names of the model. - output_tensor_names (list of string): output tensor names of the model. - - Returns: - graph_def (tf.compat.v1.Session): tf.compat.v1.Session object. - input_names (list of string): validated input names. - output_names (list of string): validated output names. - """ from tensorflow.python.saved_model import signature_constants, tag_constants + from neural_compressor.tensorflow.quantization.utils.utility import parse_saved_model + saved_model_exported_names = [signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY] saved_model_tags = set([tag_constants.SERVING]) @@ -519,40 +509,42 @@ def try_loading_keras(model, input_tensor_names, output_tensor_names): # pragma output_tensor_names (list of string): output tensor names of the model. Returns: - graph_def (tf.compat.v1.Session): tf.compat.v1.Session object. - input_names (list of string): validated input names. - output_names (list of string): validated output names. + sess (tf.compat.v1.Session): tf.compat.v1.Session object. + input_tensor_names (list of string): validated input_tensor_names. + output_tensor_names (list of string): validated output_tensor_names. 
""" temp_dir = tempfile.mkdtemp() - if not isinstance(model, tf.keras.Model): - model = tf.keras.models.load_model(model) - keras_format = _check_keras_format(model, temp_dir) - - if keras_format == "saved_model_v2": - try: - graph_def, input_names, output_names = _get_graph_from_saved_model_v2( - temp_dir, input_tensor_names, output_tensor_names - ) - if "_FusedBatchNormEx" in [node.op for node in graph_def.node]: + if tf.version.VERSION > "2.1.0": + if not isinstance(model, tf.keras.Model): + model = tf.keras.models.load_model(model) + keras_format = _check_keras_format(model, temp_dir) + if keras_format == "saved_model_v2": + try: + graph_def, input_names, output_names = _get_graph_from_saved_model_v2( + temp_dir, input_tensor_names, output_tensor_names + ) + if "_FusedBatchNormEx" in [node.op for node in graph_def.node]: + keras_format = "trackable_object" + except: keras_format = "trackable_object" - except: - keras_format = "trackable_object" - - if keras_format == "trackable_object": - try: - graph_def, input_names, output_names = _get_graph_from_original_keras_v2(model) - except: - keras_format = "saved_model_v1" - - if keras_format == "saved_model_v1": # pragma: no cover - try: - tf.keras.backend.set_learning_phase(0) - graph_def, input_names, output_names = _get_graph_from_saved_model_v1(model) - except: - raise ValueError("Not supported keras model type...") + if keras_format == "trackable_object": + try: + graph_def, input_names, output_names = _get_graph_from_original_keras_v2(model, temp_dir) + except: + keras_format = "saved_model_v1" + if keras_format == "saved_model_v1": # pragma: no cover + try: + tf.keras.backend.set_learning_phase(0) + graph_def, input_names, output_names = _get_graph_from_saved_model_v1(model) + except: + raise ValueError("Not supported keras model type...") + # tensorflow 1.x use v1 convert method + else: + tf.keras.backend.set_learning_phase(0) + graph_def, input_names, output_names = _get_graph_from_saved_model_v1(model) shutil.rmtree(temp_dir, True) - return graph_def, input_names, output_names + return graph_def_session(graph_def, input_names, output_names, **kwargs) def keras_session(model, input_tensor_names, output_tensor_names, **kwargs): @@ -748,19 +740,12 @@ def saved_model_session(model, input_tensor_names, output_tensor_names, **kwargs output_tensor_names (list of string): validated output_tensor_names. """ try: - graph_def, input_names, output_names = _get_graph_from_saved_model_v3( + graph_def, input_names, output_names = _get_graph_from_saved_model_v2( model, input_tensor_names, output_tensor_names ) except: - try: - graph_def, input_names, output_names = _get_graph_from_saved_model_v2( - model, input_tensor_names, output_tensor_names - ) - except: - graph_def, input_names, output_names = _get_graph_from_saved_model_v1(model) - + graph_def, input_names, output_names = _get_graph_from_saved_model_v1(model) assert graph_def is not None, "Can not parse the saved model..." 
- return graph_def_session(graph_def, input_names, output_names, **kwargs) @@ -1223,6 +1208,7 @@ def graph_def(self): def graph_def(self, graph_def): """Set graph definition.""" self._graph_def = graph_def + self.adjust_weight(self.graph_def) # the attributes of some nodes can't be correctly read if don't import the graph_def tf.import_graph_def(self._graph_def, name="") @@ -1254,7 +1240,6 @@ def sq_weight_scale_dict(self): """Return dict of weight scaler for smooth quantization.""" if not self._sq_weight_scale_dict: # pragma: no cover self._sq_weight_scale_dict = self.kwargs.get("sq_weight_scale_dict", None) - assert self._weight_name_mapping is not None, "sq_weight_scale_dict should not be None!" return self._sq_weight_scale_dict @sq_weight_scale_dict.setter @@ -1314,9 +1299,22 @@ def adjust_weight(self, graph_def): from neural_compressor.tensorflow.quantization.utils.utility import reconstruct_saved_model + if not self.model_path: + self.model_path = DEFAULT_WORKSPACE + self.model_path = os.path.abspath(os.path.expanduser(self.model_path)) + if os.path.exists(self.model_path): + import shutil + + shutil.rmtree(self.model_path) + os.makedirs(self.model_path, exist_ok=True) + reconstruct_saved_model(graph_def, self.func, self.frozen_func, self._saved_model, self.model_path) model = load.load(self.model_path, [tag_constants.SERVING]) + if not self._sq_weight_scale_dict: + self._auto_trackable = model + return + for idx, weight_tensor in enumerate(model.variables): parsed_weight_name = self.weight_name_mapping(weight_tensor.name) if parsed_weight_name in self.sq_weight_scale_dict: @@ -1348,14 +1346,132 @@ def save(self, root=None): shutil.rmtree(root) os.makedirs(root, exist_ok=True) - self.adjust_weight(self._graph_def) - graph_def, _saved_model, func, frozen_func, _, _ = parse_saved_model(self._auto_trackable) + if self.sq_weight_scale_dict: + self.adjust_weight(self._graph_def) + graph_def, _saved_model, func, frozen_func, _, _ = parse_saved_model(self.model) reconstruct_saved_model(graph_def, func, frozen_func, _saved_model, root) logger.info("Save quantized model to {}.".format(root)) # delete the LLM file saved in this temporary path shutil.rmtree(self.model_path, ignore_errors=True) +class TensorflowSubclassedKerasModel(TensorflowSavedModelModel): + """Build a subclassed Keras model.""" + + def __init__(self, model="", **kwargs): + """Initialize a subclassed Keras model. + + Args: + model (string or tf.keras.Model object): model path or model object. 
+        """
+        super(TensorflowSubclassedKerasModel, self).__init__(model)
+        self.model_type = "saved_model"
+        self._keras_model = None
+
+    def _build_as_functional_model(self, model_path):
+        TFSMlayer = tf.keras.layers.TFSMLayer(model_path, call_endpoint="serving_default")
+        inputs = tf.keras.Input(shape=(3, 224, 224))
+        outputs = TFSMlayer(inputs)
+        return tf.keras.Model(inputs, outputs)
+
+    @property
+    def model(self):
+        """Return model in Keras Functional object."""
+        if self._keras_model:
+            return self._keras_model
+
+        root = DEFAULT_WORKSPACE + "/saved_model"
+        root = os.path.abspath(os.path.expanduser(root))
+        if os.path.exists(root):
+            shutil.rmtree(root)
+        os.makedirs(root, exist_ok=True)
+        if not self._sess:
+            self._load_sess(self._model, **self.kwargs)
+        _, builder = self.build_saved_model(root)
+        builder.save()
+        self._keras_model = self._build_as_functional_model(root)
+        shutil.rmtree(root)
+
+        return self._keras_model
+
+    @model.setter
+    def model(self, q_model):
+        """Set model itself."""
+        self._keras_model = q_model
+
+    def save(self, root=None):
+        """Save the subclassed Keras model."""
+        if not root:
+            root = DEFAULT_WORKSPACE + "/keras_model.keras"
+        root = os.path.abspath(os.path.expanduser(root))
+        os.makedirs(os.path.dirname(root), exist_ok=True)
+
+        self.model.save(root)
+        return root
+
+
+class TensorflowQATModel(TensorflowSavedModelModel):
+    """Build Tensorflow QAT model."""
+
+    def __init__(self, model="", **kwargs):
+        """Initialize a Tensorflow QAT model.
+
+        Args:
+            model (string or tf.keras.Model object): model path or model object.
+        """
+        assert isinstance(model, tf.keras.Model) or isinstance(
+            model, str
+        ), "The TensorflowQATModel should be initialized either by a string or a tf.keras.Model."
+        super(TensorflowQATModel, self).__init__(model)
+        self.keras_model = None
+        self.model_type = "keras"
+
+    @property
+    def model(self):
+        """Return model itself."""
+        if self.keras_model is None:
+            if isinstance(self._model, tf.keras.Model):
+                self.keras_model = self._model
+            else:
+                self.keras_model = tf.keras.models.load_model(self._model)
+
+        return self.keras_model
+
+    @model.setter
+    def model(self, q_model):
+        """Set model itself."""
+        self.keras_model = q_model
+
+    @property
+    def frozen_graph_def(self):
+        """Get frozen graph_def."""
+        graph_def = tf.compat.v1.graph_util.convert_variables_to_constants(
+            self.sess, self.sess.graph_def, self.output_node_names
+        )
+        return graph_def
+
+    def save(self, root=None):
+        """Save Tensorflow QAT model."""
+        if not root:
+            root = DEFAULT_WORKSPACE + "/saved_model"
+        root = os.path.abspath(os.path.expanduser(root))
+        os.makedirs(os.path.dirname(root), exist_ok=True)
+        if root.endswith(".pb"):
+            saved_format = "pb file"
+            graph_def = self.frozen_graph_def
+            f = tf.io.gfile.GFile(root, "wb")
+            f.write(graph_def.SerializeToString())
+        else:
+            q_aware_model = self.keras_model
+            q_aware_model.save(root)
+            saved_format = "saved_model"
+            if root.endswith(".h5"):
+                saved_format = "h5 file"
+        logger.info("Save quantized model to {}.".format(saved_format))
+        return root
+
+
 class TensorflowCheckpointModel(TensorflowBaseModel):
     """Build Tensorflow checkpoint model."""

@@ -1440,24 +1556,12 @@ def save(self, root, *args, **kwargs):
     @property
     def input_node_names(self):
         """Return input node names."""
-        names = (
-            self.model.input_names
-            if version1_lt_version2(tf.version.VERSION, "2.16.1")
-            else [tensor.name for tensor in self.model.inputs]
-        )
-
-        return names
+        return self.model.input_names

     @property
     def output_node_names(self):
"""Return output node names.""" - names = ( - self.model.output_names - if version1_lt_version2(tf.version.VERSION, "2.16.1") - else [tensor.name for tensor in self.model.outputs] - ) - - return names + return self.model.output_names TENSORFLOW_MODELS = { diff --git a/test/requirements.txt b/test/requirements.txt index 0c117db3d86..37b99f5aac2 100644 --- a/test/requirements.txt +++ b/test/requirements.txt @@ -11,6 +11,7 @@ onnxruntime onnxruntime-extensions; python_version < '3.11' optimum<=1.24.0 peft<=0.14.0 +pydantic tensorflow-addons<=0.23.0 tf2onnx<=1.16.1 tf_slim<=1.1.0