Merge branch 'huggingface:main' into main
jiqing-feng authored Apr 22, 2024
2 parents 9c96364 + 673b88b commit c54642c
Showing 46 changed files with 2,191 additions and 788 deletions.
14 changes: 10 additions & 4 deletions .github/workflows/test_inc.yml
@@ -32,11 +32,17 @@ jobs:
         python -m pip install --upgrade pip
         pip install cmake
         pip install py-cpuinfo
-        pip install torch==2.1.0 torchaudio==2.1.0 torchvision==0.16 --extra-index-url https://download.pytorch.org/whl/cpu
         pip install .[neural-compressor,diffusers,tests]
-        pip install intel-extension-for-pytorch==2.1.100
-        pip install intel-extension-for-transformers==1.3.2
+        pip install intel-extension-for-transformers
+        pip install peft
     - name: Test with Pytest
       run: |
-        pytest tests/neural_compressor/
+        pytest tests/neural_compressor/ --ignore tests/neural_compressor/test_ipex.py --durations=0
+    - name: Test IPEX
+      run: |
+        pip uninstall -y intel-extension-for-transformers
+        pip install torch==2.1.0 torchaudio==2.1.0 torchvision==0.16 --extra-index-url https://download.pytorch.org/whl/cpu
+        pip install intel-extension-for-pytorch==2.1.100
+        pytest tests/neural_compressor/test_ipex.py
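
For context, the split-out IPEX step exercises the IPEX integration in `optimum.intel`. A minimal sketch of the kind of usage such tests cover — the model choice and exact call pattern here are assumptions for illustration, not taken from `test_ipex.py`:

```python
# Minimal sketch (assumed, not from tests/neural_compressor/test_ipex.py):
# load a model through the IPEX-optimized classes in optimum-intel.
from transformers import AutoTokenizer, pipeline
from optimum.intel import IPEXModelForSequenceClassification

model_id = "distilbert-base-uncased-finetuned-sst-2-english"
model = IPEXModelForSequenceClassification.from_pretrained(model_id, export=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
print(classifier("IPEX keeps inference on Intel CPUs fast."))
```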
6 changes: 5 additions & 1 deletion .github/workflows/test_openvino.yml
@@ -35,7 +35,11 @@ jobs:
         pip install .[openvino,openvino-tokenizers,tests,diffusers] onnxruntime
     - name: Test with Pytest
       run: |
-        pytest tests/openvino/ --ignore test_modeling_basic
+        pytest tests/openvino/ --ignore tests/openvino/test_modeling_basic.py --durations=0
+    - name: Test basic
+      run: |
+        pip uninstall -y nncf
+        pytest tests/openvino/test_modeling_basic.py
     - name: Test openvino-nightly
       run: |
         pip uninstall -y openvino
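
The new `Test basic` step re-runs the basic modeling tests with `nncf` uninstalled, checking that plain OpenVINO inference has no hard nncf dependency. A rough sketch of what such a basic check exercises (the model choice is illustrative):

```python
# Rough sketch of a basic OpenVINO modeling check that needs no nncf:
# export to OpenVINO IR on the fly and run one forward pass.
from transformers import AutoTokenizer
from optimum.intel import OVModelForSequenceClassification

model_id = "distilbert-base-uncased-finetuned-sst-2-english"
model = OVModelForSequenceClassification.from_pretrained(model_id, export=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)
outputs = model(**tokenizer("A quick smoke test.", return_tensors="pt"))
print(outputs.logits.shape)
```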
6 changes: 3 additions & 3 deletions .github/workflows/test_openvino_basic.yml
@@ -25,7 +25,7 @@ jobs:
         # Testing lower and upper bound of supported Python versions
         # This also ensures that the test fails if dependencies break for Python 3.7
         python-version: ["3.8", "3.11"]
-        transformers: ['transformers', 'git+https://github.com/huggingface/transformers.git']
+        transformers: ['transformers']
         optimum: ['optimum', 'git+https://github.com/huggingface/optimum.git']
 
     runs-on: ubuntu-20.04
@@ -42,7 +42,7 @@
         # Install openvino manually to prevent dependency conflicts when .[openvino] pins
         # optimum or transformers to a specific version
         # Install PyTorch CPU to prevent unnecessary downloading/installing of CUDA packages
-        pip install torch --extra-index-url https://download.pytorch.org/whl/cpu
+        pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
         pip install .[tests] openvino onnx onnxruntime ${{ matrix.optimum}} ${{ matrix.transformers }}
     - name: Pip freeze
@@ -51,4 +51,4 @@
     - name: Test with Pytest
       run: |
-        pytest tests/openvino/test_modeling_basic.py
+        RUN_SLOW=1 pytest tests/openvino/test_modeling.py -s -m "run_slow" --durations=0
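
The replacement step opts into slow tests via `RUN_SLOW=1` and a `run_slow` pytest marker. A hedged sketch of how such a gate is typically wired up — the helper below is an assumption, not copied from the repo:

```python
# Hypothetical sketch (not from the repo): a pytest marker gated on RUN_SLOW,
# matching the `-m "run_slow"` selection used in the workflow above.
import os
import pytest

run_slow_gate = pytest.mark.skipif(
    os.environ.get("RUN_SLOW", "0") != "1", reason="set RUN_SLOW=1 to run slow tests"
)

@run_slow_gate
@pytest.mark.run_slow
def test_export_large_model():
    ...  # a slow export/inference path would be exercised here
```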
2 changes: 1 addition & 1 deletion Makefile
@@ -51,7 +51,7 @@ build_doc_docker_image:
 doc: build_doc_docker_image
 	@test -n "$(BUILD_DIR)" || (echo "BUILD_DIR is empty." ; exit 1)
 	@test -n "$(VERSION)" || (echo "VERSION is empty." ; exit 1)
-	docker run -v $(CURRENT_DIR):/doc_folder --workdir=/doc_folder doc_maker \
+	docker run -v $(CURRENT_DIR):/doc_folder --workdir=/doc_folder --env CI=$(CI) doc_maker \
 	doc-builder build optimum.intel /optimum-intel/docs/source/ \
 	--repo_name optimum-intel \
 	--build_dir $(BUILD_DIR) \
19 changes: 13 additions & 6 deletions README.md
@@ -19,9 +19,9 @@ To install the latest release of 🤗 Optimum Intel with the corresponding requi
 
 | Accelerator | Installation |
 |:-----------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------|
-| [Intel Neural Compressor](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) | `pip install --upgrade-strategy eager "optimum[neural-compressor]"` |
-| [OpenVINO](https://docs.openvino.ai) | `pip install --upgrade-strategy eager "optimum[openvino]"` |
-| [Intel Extension for PyTorch](https://intel.github.io/intel-extension-for-pytorch/#introduction) | `pip install --upgrade-strategy eager "optimum[ipex]"` |
+| [Intel Neural Compressor](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) | `pip install --upgrade --upgrade-strategy eager "optimum[neural-compressor]"` |
+| [OpenVINO](https://docs.openvino.ai) | `pip install --upgrade --upgrade-strategy eager "optimum[openvino]"` |
+| [Intel Extension for PyTorch](https://intel.github.io/intel-extension-for-pytorch/#introduction) | `pip install --upgrade --upgrade-strategy eager "optimum[ipex]"` |
 
 The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version.
 
@@ -78,12 +78,18 @@ It is possible to export your model to the [OpenVINO IR](https://docs.openvino.a
 optimum-cli export openvino --model gpt2 ov_model
 ```
 
-You can also apply 8-bit weight-only quantization when exporting your model : the model linear and embedding weights will be quantized to INT8, the activations will be kept in floating point precision.
+You can also apply 8-bit weight-only quantization when exporting your model : the model linear, embedding and convolution weights will be quantized to INT8, the activations will be kept in floating point precision.
 
 ```plain
 optimum-cli export openvino --model gpt2 --weight-format int8 ov_model
 ```
 
+Quantization in hybrid mode can be applied to Stable Diffusion pipeline during model export. This involves applying hybrid post-training quantization to the UNet model and weight-only quantization for the rest of the pipeline components. In the hybrid mode, weights in MatMul and Embedding layers are quantized, as well as activations of other layers.
+
+```plain
+optimum-cli export openvino --model stabilityai/stable-diffusion-2-1 --dataset conceptual_captions --weight-format int8 ov_model
+```
+
 To apply quantization on both weights and activations, you can find more information in the [documentation](https://huggingface.co/docs/optimum/main/en/intel/optimization_ov).
 
 #### Inference:
@@ -122,7 +128,7 @@ Post-training static quantization introduces an additional calibration step wher
 
 ```python
 from functools import partial
-from optimum.intel import OVQuantizer, OVModelForSequenceClassification
+from optimum.intel import OVQuantizer, OVModelForSequenceClassification, OVConfig, OVQuantizationConfig
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 
 model_id = "distilbert-base-uncased-finetuned-sst-2-english"
@@ -145,7 +151,8 @@ calibration_dataset = quantizer.get_calibration_dataset(
 # The directory where the quantized model will be saved
 save_dir = "nncf_results"
 # Apply static quantization and save the resulting model in the OpenVINO IR format
-quantizer.quantize(calibration_dataset=calibration_dataset, save_directory=save_dir)
+ov_config = OVConfig(quantization_config=OVQuantizationConfig())
+quantizer.quantize(ov_config=ov_config, calibration_dataset=calibration_dataset, save_directory=save_dir)
 # Load the quantized model
 optimized_model = OVModelForSequenceClassification.from_pretrained(save_dir)
 ```
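
As a companion to the export commands in this README diff, a minimal sketch of loading the exported artifact for inference — the directory name mirrors the commands above, and the model-class choice for gpt2 is an assumption:

```python
# Minimal sketch: load and run the INT8 weight-only gpt2 exported above
# with `optimum-cli export openvino ... ov_model`.
from transformers import AutoTokenizer
from optimum.intel import OVModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = OVModelForCausalLM.from_pretrained("ov_model")
inputs = tokenizer("Quantized weights, floating-point activations:", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0]))
```

A Stable Diffusion pipeline exported in hybrid mode would load analogously via `OVStableDiffusionPipeline.from_pretrained`.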
8 changes: 4 additions & 4 deletions docs/source/installation.mdx
@@ -18,10 +18,10 @@ limitations under the License.
 
 To install the latest release of 🤗 Optimum Intel with the corresponding required dependencies, you can do respectively:
 
-| Accelerator | Installation |
-|:-----------------------------------------------------------------------------------------------------------------------|:-------------------------------------------------------------------|
-| [Intel Neural Compressor (INC)](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) | `pip install --upgrade-strategy eager "optimum[neural-compressor]"`|
-| [Intel OpenVINO](https://docs.openvino.ai ) | `pip install --upgrade-strategy eager "optimum[openvino]"` |
+| Accelerator | Installation |
+|:-----------------------------------------------------------------------------------------------------------------------|:-----------------------------------------------------------------------------|
+| [Intel Neural Compressor (INC)](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) | `pip install --upgrade --upgrade-strategy eager "optimum[neural-compressor]"`|
+| [Intel OpenVINO](https://docs.openvino.ai ) | `pip install --upgrade --upgrade-strategy eager "optimum[openvino]"` |
 
 The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version.
 
5 changes: 3 additions & 2 deletions docs/source/optimization_ov.mdx
@@ -84,7 +84,7 @@ Here is how to apply static quantization on a fine-tuned DistilBERT given your o
 
 ```python
 from transformers import AutoTokenizer
-from optimum.intel import OVQuantizer, OVModelForSequenceClassification,
+from optimum.intel import OVQuantizer, OVModelForSequenceClassification, OVConfig, OVQuantizationConfig
 
 model_id = "distilbert-base-uncased-finetuned-sst-2-english"
 model = OVModelForSequenceClassification.from_pretrained(model_id, export=True)
@@ -95,7 +95,8 @@ save_dir = "ptq_model"
 quantizer = OVQuantizer.from_pretrained(model)
 
 # Apply static quantization and export the resulting quantized model to OpenVINO IR format
-quantizer.quantize(calibration_dataset=calibration_dataset, save_directory=save_dir)
+ov_config = OVConfig(quantization_config=OVQuantizationConfig())
+quantizer.quantize(ov_config=ov_config, calibration_dataset=calibration_dataset, save_directory=save_dir)
 # Save the tokenizer
 tokenizer.save_pretrained(save_dir)
 ```
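
A short follow-up sketch of how the statically quantized model saved above is used afterwards (standard optimum-intel loading; the input sentence is illustrative):

```python
# Follow-up sketch: run the statically quantized model saved in ptq_model.
from transformers import AutoTokenizer, pipeline
from optimum.intel import OVModelForSequenceClassification

save_dir = "ptq_model"
model = OVModelForSequenceClassification.from_pretrained(save_dir)
tokenizer = AutoTokenizer.from_pretrained(save_dir)
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
print(classifier("The statically quantized model is ready for inference."))
```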
41 changes: 21 additions & 20 deletions examples/neural_compressor/language-modeling/run_clm.py
@@ -64,8 +64,7 @@
 
 
 if is_intel_extension_for_transformers_available():
-    from intel_extension_for_transformers.transformers.utils.config import WeightOnlyQuantConfig
-
+    from intel_extension_for_transformers.transformers.utils.config import GPTQConfig, RtnConfig
 
 os.environ["CUDA_VISIBLE_DEVICES"] = ""
 
@@ -227,8 +226,9 @@ class OptimizationArguments:
         metadata={"help": "Scheme for weight only quantization. Choose from 'sym' and 'asym'."},
     )
     quantization_methodology: str = field(
-        default="RTN",
-        metadata={"help": "Quantization methodology for weight only quantization. Choose from 'RTN' and 'GPTQ'."},
+        choices=["rtn", "gptq"],
+        default="rtn",
+        metadata={"help": "Quantization methodology for weight only quantization. Choose from 'rtn' and 'gptq'."},
     )
     damp_percent: float = field(
         default=0.01,
@@ -662,22 +662,23 @@ def compute_metrics(eval_preds):
             raise ImportError(INTEL_EXTENSION_FOR_TRANSFORMERS_IMPORT_ERROR.format("WeightOnly quantization"))
         if optim_args.apply_pruning or optim_args.apply_distillation:
             raise ValueError("Weight only quantization and pruning or distillation cannot be combined.")
-        if optim_args.quantization_methodology == "GPTQ":
-            algorithm_args = {
-                "act_order": False,
-                "percdamp": optim_args.damp_percent,
-                "block_size": optim_args.gptq_block_size,
-                "nsamples": optim_args.num_calibration_samples,
-                "use_max_length": optim_args.use_max_length,
-                "pad_max_length": optim_args.pad_max_length,
-            }
-        quantization_config = WeightOnlyQuantConfig(
-            weight_dtype=optim_args.weight_dtype,
-            group_size=optim_args.group_size,
-            scheme=optim_args.weight_only_scheme,
-            algorithm=optim_args.quantization_methodology,
-            algorithm_args=algorithm_args if optim_args.quantization_methodology == "GPTQ" else None,
-        )
+
+        algorithm_args = {
+            "weight_dtype": optim_args.weight_dtype,
+            "sym": optim_args.weight_only_scheme == "sym",
+            "group_size": optim_args.group_size,
+        }
+
+        if optim_args.quantization_methodology == "gptq":
+            quantization_config = GPTQConfig(
+                damp_percent=optim_args.damp_percent,
+                nsamples=optim_args.num_calibration_samples,
+                blocksize=optim_args.gptq_block_size,
+                **algorithm_args,
+            )
+        else:
+            quantization_config = RtnConfig(**algorithm_args)
+
     else:
         quantization_config = PostTrainingQuantConfig(
             approach=optim_args.quantization_approach, recipes=recipes