Merge branch 'huggingface:main' into main
jiqing-feng authored Apr 22, 2024
2 parents 9c96364 + 673b88b commit c54642c
Showing 46 changed files with 2,191 additions and 788 deletions.
14 changes: 10 additions & 4 deletions .github/workflows/test_inc.yml
@@ -32,11 +32,17 @@ jobs:
         python -m pip install --upgrade pip
         pip install cmake
         pip install py-cpuinfo
-        pip install torch==2.1.0 torchaudio==2.1.0 torchvision==0.16 --extra-index-url https://download.pytorch.org/whl/cpu
         pip install .[neural-compressor,diffusers,tests]
-        pip install intel-extension-for-pytorch==2.1.100
-        pip install intel-extension-for-transformers==1.3.2
+        pip install intel-extension-for-transformers
+        pip install peft
     - name: Test with Pytest
       run: |
-        pytest tests/neural_compressor/
+        pytest tests/neural_compressor/ --ignore tests/neural_compressor/test_ipex.py --durations=0
+    - name: Test IPEX
+      run: |
+        pip uninstall -y intel-extension-for-transformers
+        pip install torch==2.1.0 torchaudio==2.1.0 torchvision==0.16 --extra-index-url https://download.pytorch.org/whl/cpu
+        pip install intel-extension-for-pytorch==2.1.100
+        pytest tests/neural_compressor/test_ipex.py
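
For context, the split-out IPEX step exercises the IPEX integration in `optimum.intel`. A minimal sketch of the kind of usage such tests cover — the model choice and exact call pattern here are assumptions for illustration, not taken from `test_ipex.py`:

```python
# Minimal sketch (assumed, not from tests/neural_compressor/test_ipex.py):
# load a model through the IPEX-optimized classes in optimum-intel.
from transformers import AutoTokenizer, pipeline
from optimum.intel import IPEXModelForSequenceClassification

model_id = "distilbert-base-uncased-finetuned-sst-2-english"
model = IPEXModelForSequenceClassification.from_pretrained(model_id, export=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
print(classifier("IPEX keeps inference on Intel CPUs fast."))
```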
6 changes: 5 additions & 1 deletion .github/workflows/test_openvino.yml
@@ -35,7 +35,11 @@ jobs:
         pip install .[openvino,openvino-tokenizers,tests,diffusers] onnxruntime
     - name: Test with Pytest
       run: |
-        pytest tests/openvino/ --ignore test_modeling_basic
+        pytest tests/openvino/ --ignore tests/openvino/test_modeling_basic.py --durations=0
+    - name: Test basic
+      run: |
+        pip uninstall -y nncf
+        pytest tests/openvino/test_modeling_basic.py
     - name: Test openvino-nightly
       run: |
         pip uninstall -y openvino
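
The new `Test basic` step re-runs the basic modeling tests with `nncf` uninstalled, checking that plain OpenVINO inference has no hard nncf dependency. A rough sketch of what such a basic check exercises (the model choice is illustrative):

```python
# Rough sketch of a basic OpenVINO modeling check that needs no nncf:
# export to OpenVINO IR on the fly and run one forward pass.
from transformers import AutoTokenizer
from optimum.intel import OVModelForSequenceClassification

model_id = "distilbert-base-uncased-finetuned-sst-2-english"
model = OVModelForSequenceClassification.from_pretrained(model_id, export=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)
outputs = model(**tokenizer("A quick smoke test.", return_tensors="pt"))
print(outputs.logits.shape)
```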
6 changes: 3 additions & 3 deletions .github/workflows/test_openvino_basic.yml
@@ -25,7 +25,7 @@ jobs:
         # Testing lower and upper bound of supported Python versions
         # This also ensures that the test fails if dependencies break for Python 3.7
         python-version: ["3.8", "3.11"]
-        transformers: ['transformers', 'git+https://github.com/huggingface/transformers.git']
+        transformers: ['transformers']
         optimum: ['optimum', 'git+https://github.com/huggingface/optimum.git']
 
     runs-on: ubuntu-20.04
@@ -42,7 +42,7 @@
         # Install openvino manually to prevent dependency conflicts when .[openvino] pins
         # optimum or transformers to a specific version
         # Install PyTorch CPU to prevent unnecessary downloading/installing of CUDA packages
-        pip install torch --extra-index-url https://download.pytorch.org/whl/cpu
+        pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
         pip install .[tests] openvino onnx onnxruntime ${{ matrix.optimum}} ${{ matrix.transformers }}
     - name: Pip freeze
@@ -51,4 +51,4 @@
     - name: Test with Pytest
       run: |
-        pytest tests/openvino/test_modeling_basic.py
+        RUN_SLOW=1 pytest tests/openvino/test_modeling.py -s -m "run_slow" --durations=0
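
The replacement step opts into slow tests via `RUN_SLOW=1` and a `run_slow` pytest marker. A hedged sketch of how such a gate is typically wired up — the helper below is an assumption, not copied from the repo:

```python
# Hypothetical sketch (not from the repo): a pytest marker gated on RUN_SLOW,
# matching the `-m "run_slow"` selection used in the workflow above.
import os
import pytest

run_slow_gate = pytest.mark.skipif(
    os.environ.get("RUN_SLOW", "0") != "1", reason="set RUN_SLOW=1 to run slow tests"
)

@run_slow_gate
@pytest.mark.run_slow
def test_export_large_model():
    ...  # a slow export/inference path would be exercised here
```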
2 changes: 1 addition & 1 deletion Makefile
@@ -51,7 +51,7 @@ build_doc_docker_image:
 doc: build_doc_docker_image
 	@test -n "$(BUILD_DIR)" || (echo "BUILD_DIR is empty." ; exit 1)
 	@test -n "$(VERSION)" || (echo "VERSION is empty." ; exit 1)
-	docker run -v $(CURRENT_DIR):/doc_folder --workdir=/doc_folder doc_maker \
+	docker run -v $(CURRENT_DIR):/doc_folder --workdir=/doc_folder --env CI=$(CI) doc_maker \
 	doc-builder build optimum.intel /optimum-intel/docs/source/ \
 	--repo_name optimum-intel \
 	--build_dir $(BUILD_DIR) \
19 changes: 13 additions & 6 deletions README.md
@@ -19,9 +19,9 @@ To install the latest release of 🤗 Optimum Intel with the corresponding requi
 
 | Accelerator | Installation |
 |:-----------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------|
-| [Intel Neural Compressor](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) | `pip install --upgrade-strategy eager "optimum[neural-compressor]"` |
-| [OpenVINO](https://docs.openvino.ai) | `pip install --upgrade-strategy eager "optimum[openvino]"` |
-| [Intel Extension for PyTorch](https://intel.github.io/intel-extension-for-pytorch/#introduction) | `pip install --upgrade-strategy eager "optimum[ipex]"` |
+| [Intel Neural Compressor](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) | `pip install --upgrade --upgrade-strategy eager "optimum[neural-compressor]"` |
+| [OpenVINO](https://docs.openvino.ai) | `pip install --upgrade --upgrade-strategy eager "optimum[openvino]"` |
+| [Intel Extension for PyTorch](https://intel.github.io/intel-extension-for-pytorch/#introduction) | `pip install --upgrade --upgrade-strategy eager "optimum[ipex]"` |
 
 The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version.
 
@@ -78,12 +78,18 @@ It is possible to export your model to the [OpenVINO IR](https://docs.openvino.a
 optimum-cli export openvino --model gpt2 ov_model
 ```
 
-You can also apply 8-bit weight-only quantization when exporting your model : the model linear and embedding weights will be quantized to INT8, the activations will be kept in floating point precision.
+You can also apply 8-bit weight-only quantization when exporting your model : the model linear, embedding and convolution weights will be quantized to INT8, the activations will be kept in floating point precision.
 
 ```plain
 optimum-cli export openvino --model gpt2 --weight-format int8 ov_model
 ```
 
+Quantization in hybrid mode can be applied to Stable Diffusion pipeline during model export. This involves applying hybrid post-training quantization to the UNet model and weight-only quantization for the rest of the pipeline components. In the hybrid mode, weights in MatMul and Embedding layers are quantized, as well as activations of other layers.
+
+```plain
+optimum-cli export openvino --model stabilityai/stable-diffusion-2-1 --dataset conceptual_captions --weight-format int8 ov_model
+```
+
 To apply quantization on both weights and activations, you can find more information in the [documentation](https://huggingface.co/docs/optimum/main/en/intel/optimization_ov).
 
 #### Inference:
@@ -122,7 +128,7 @@ Post-training static quantization introduces an additional calibration step wher
 
 ```python
 from functools import partial
-from optimum.intel import OVQuantizer, OVModelForSequenceClassification
+from optimum.intel import OVQuantizer, OVModelForSequenceClassification, OVConfig, OVQuantizationConfig
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 
 model_id = "distilbert-base-uncased-finetuned-sst-2-english"
@@ -145,7 +151,8 @@ calibration_dataset = quantizer.get_calibration_dataset(
 # The directory where the quantized model will be saved
 save_dir = "nncf_results"
 # Apply static quantization and save the resulting model in the OpenVINO IR format
-quantizer.quantize(calibration_dataset=calibration_dataset, save_directory=save_dir)
+ov_config = OVConfig(quantization_config=OVQuantizationConfig())
+quantizer.quantize(ov_config=ov_config, calibration_dataset=calibration_dataset, save_directory=save_dir)
 # Load the quantized model
 optimized_model = OVModelForSequenceClassification.from_pretrained(save_dir)
 ```
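
As a companion to the export commands in this README diff, a minimal sketch of loading the exported artifact for inference — the directory name mirrors the commands above, and the model-class choice for gpt2 is an assumption:

```python
# Minimal sketch: load and run the INT8 weight-only gpt2 exported above
# with `optimum-cli export openvino ... ov_model`.
from transformers import AutoTokenizer
from optimum.intel import OVModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = OVModelForCausalLM.from_pretrained("ov_model")
inputs = tokenizer("Quantized weights, floating-point activations:", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0]))
```

A Stable Diffusion pipeline exported in hybrid mode would load analogously via `OVStableDiffusionPipeline.from_pretrained`.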
8 changes: 4 additions & 4 deletions docs/source/installation.mdx
@@ -18,10 +18,10 @@ limitations under the License.
 
 To install the latest release of 🤗 Optimum Intel with the corresponding required dependencies, you can do respectively:
 
-| Accelerator | Installation |
-|:-----------------------------------------------------------------------------------------------------------------------|:-------------------------------------------------------------------|
-| [Intel Neural Compressor (INC)](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) | `pip install --upgrade-strategy eager "optimum[neural-compressor]"`|
-| [Intel OpenVINO](https://docs.openvino.ai ) | `pip install --upgrade-strategy eager "optimum[openvino]"` |
+| Accelerator | Installation |
+|:-----------------------------------------------------------------------------------------------------------------------|:-----------------------------------------------------------------------------|
+| [Intel Neural Compressor (INC)](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) | `pip install --upgrade --upgrade-strategy eager "optimum[neural-compressor]"`|
+| [Intel OpenVINO](https://docs.openvino.ai ) | `pip install --upgrade --upgrade-strategy eager "optimum[openvino]"` |
 
 The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version.
 
5 changes: 3 additions & 2 deletions docs/source/optimization_ov.mdx
@@ -84,7 +84,7 @@ Here is how to apply static quantization on a fine-tuned DistilBERT given your o
 
 ```python
 from transformers import AutoTokenizer
-from optimum.intel import OVQuantizer, OVModelForSequenceClassification,
+from optimum.intel import OVQuantizer, OVModelForSequenceClassification, OVConfig, OVQuantizationConfig
 
 model_id = "distilbert-base-uncased-finetuned-sst-2-english"
 model = OVModelForSequenceClassification.from_pretrained(model_id, export=True)
@@ -95,7 +95,8 @@ save_dir = "ptq_model"
 quantizer = OVQuantizer.from_pretrained(model)
 
 # Apply static quantization and export the resulting quantized model to OpenVINO IR format
-quantizer.quantize(calibration_dataset=calibration_dataset, save_directory=save_dir)
+ov_config = OVConfig(quantization_config=OVQuantizationConfig())
+quantizer.quantize(ov_config=ov_config, calibration_dataset=calibration_dataset, save_directory=save_dir)
 # Save the tokenizer
 tokenizer.save_pretrained(save_dir)
 ```
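
A short follow-up sketch of how the statically quantized model saved above is used afterwards (standard optimum-intel loading; the input sentence is illustrative):

```python
# Follow-up sketch: run the statically quantized model saved in ptq_model.
from transformers import AutoTokenizer, pipeline
from optimum.intel import OVModelForSequenceClassification

save_dir = "ptq_model"
model = OVModelForSequenceClassification.from_pretrained(save_dir)
tokenizer = AutoTokenizer.from_pretrained(save_dir)
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
print(classifier("The statically quantized model is ready for inference."))
```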
41 changes: 21 additions & 20 deletions examples/neural_compressor/language-modeling/run_clm.py
@@ -64,8 +64,7 @@
 
 
 if is_intel_extension_for_transformers_available():
-    from intel_extension_for_transformers.transformers.utils.config import WeightOnlyQuantConfig
-
+    from intel_extension_for_transformers.transformers.utils.config import GPTQConfig, RtnConfig
 
 os.environ["CUDA_VISIBLE_DEVICES"] = ""
 
@@ -227,8 +226,9 @@ class OptimizationArguments:
         metadata={"help": "Scheme for weight only quantization. Choose from 'sym' and 'asym'."},
     )
     quantization_methodology: str = field(
-        default="RTN",
-        metadata={"help": "Quantization methodology for weight only quantization. Choose from 'RTN' and 'GPTQ'."},
+        choices=["rtn", "gptq"],
+        default="rtn",
+        metadata={"help": "Quantization methodology for weight only quantization. Choose from 'rtn' and 'gptq'."},
     )
     damp_percent: float = field(
         default=0.01,
@@ -662,22 +662,23 @@ def compute_metrics(eval_preds):
             raise ImportError(INTEL_EXTENSION_FOR_TRANSFORMERS_IMPORT_ERROR.format("WeightOnly quantization"))
         if optim_args.apply_pruning or optim_args.apply_distillation:
             raise ValueError("Weight only quantization and pruning or distillation cannot be combined.")
-        if optim_args.quantization_methodology == "GPTQ":
-            algorithm_args = {
-                "act_order": False,
-                "percdamp": optim_args.damp_percent,
-                "block_size": optim_args.gptq_block_size,
-                "nsamples": optim_args.num_calibration_samples,
-                "use_max_length": optim_args.use_max_length,
-                "pad_max_length": optim_args.pad_max_length,
-            }
-        quantization_config = WeightOnlyQuantConfig(
-            weight_dtype=optim_args.weight_dtype,
-            group_size=optim_args.group_size,
-            scheme=optim_args.weight_only_scheme,
-            algorithm=optim_args.quantization_methodology,
-            algorithm_args=algorithm_args if optim_args.quantization_methodology == "GPTQ" else None,
-        )
+
+        algorithm_args = {
+            "weight_dtype": optim_args.weight_dtype,
+            "sym": optim_args.weight_only_scheme == "sym",
+            "group_size": optim_args.group_size,
+        }
+
+        if optim_args.quantization_methodology == "gptq":
+            quantization_config = GPTQConfig(
+                damp_percent=optim_args.damp_percent,
+                nsamples=optim_args.num_calibration_samples,
+                blocksize=optim_args.gptq_block_size,
+                **algorithm_args,
+            )
+        else:
+            quantization_config = RtnConfig(**algorithm_args)
+
     else:
         quantization_config = PostTrainingQuantConfig(
             approach=optim_args.quantization_approach, recipes=recipes