import warnings
from argparse import ArgumentParser
from os import listdir, makedirs
from pathlib import Path
from typing import Dict, List, Optional, Tuple

from packaging.version import Version, parse

from transformers.pipelines import Pipeline, pipeline
from transformers.tokenization_utils import BatchEncoding
from transformers.utils import ModelOutput, is_tf_available, is_torch_available

# Minimum onnxruntime version required for the quantization features used below
ORT_QUANTIZE_MINIMUM_VERSION = parse("1.4.0")

SUPPORTED_PIPELINES = [
    "feature-extraction",
    "ner",
    "sentiment-analysis",
    "fill-mask",
    "question-answering",
    "text-generation",
    "translation_en_to_fr",
    "translation_en_to_de",
    "translation_en_to_ro",
]

class OnnxConverterArgumentParser(ArgumentParser):
    """
    Wraps all the script arguments supported to export transformers models to ONNX IR
    """

    def __init__(self):
        super().__init__("ONNX Converter")

        self.add_argument(
            "--pipeline",
            type=str,
            choices=SUPPORTED_PIPELINES,
            default="feature-extraction",
        )
        self.add_argument(
            "--model",
            type=str,
            required=True,
            help="Model's id or path (ex: google-bert/bert-base-cased)",
        )
        self.add_argument("--tokenizer", type=str, help="Tokenizer's id or path (ex: google-bert/bert-base-cased)")
        self.add_argument(
            "--framework",
            type=str,
            choices=["pt", "tf"],
            help="Framework for loading the model",
        )
        self.add_argument("--opset", type=int, default=11, help="ONNX opset to use")
        self.add_argument(
            "--check-loading",
            action="store_true",
            help="Check ONNX Runtime is able to load the exported model",
        )
        self.add_argument(
            "--use-external-format",
            action="store_true",
            help="Allow exporting models bigger than 2GB",
        )
        self.add_argument(
            "--quantize",
            action="store_true",
            help="Quantize the neural network to be run with int8",
        )
        self.add_argument("output")
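
# One way to invoke this script (model name and output path are illustrative);
# `--quantize` additionally writes "-optimized" and "-quantized" copies of the graph:
#
#   python -m transformers.convert_graph_to_onnx --framework pt \
#       --model bert-base-cased --quantize onnx/bert-base-cased.onnx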

def generate_identified_filename(filename: Path, identifier: str) -> Path:
    """
    Append a string identifier at the end (before the extension, if any) of the provided filepath

    Args:
        filename: pathlib.Path The actual path object we would like to add an identifier suffix to
        identifier: The suffix to add

    Returns: Path with the identifier appended before the extension
    """
    return filename.parent.joinpath(filename.stem + identifier).with_suffix(filename.suffix)
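
# For example (paths illustrative): generate_identified_filename(Path("onnx/model.onnx"), "-quantized")
# returns Path("onnx/model-quantized.onnx").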

def check_onnxruntime_requirements(minimum_version: Version):
    """
    Check that onnxruntime is installed and that the installed version is recent enough

    Raises:
        ImportError: If onnxruntime is not installed or the installed version is too old
    """
    try:
        import onnxruntime
    except ImportError:
        # Keep this handler narrow so the version error below is not masked by the
        # "not installed" message
        raise ImportError(
            "onnxruntime doesn't seem to be currently installed. "
            "Please install onnxruntime by running `pip install onnxruntime`"
            " and relaunch the conversion."
        )

    # Parse the version of the installed onnxruntime
    ort_version = parse(onnxruntime.__version__)

    # We require at least the requested minimum version (1.4.0 for quantization)
    if ort_version < minimum_version:
        raise ImportError(
            f"We found an older version of onnxruntime ({onnxruntime.__version__}) "
            f"but we require onnxruntime to be >= {minimum_version} to enable all the conversion options.\n"
            "Please update onnxruntime by running `pip install --upgrade onnxruntime`"
        )

def ensure_valid_input(model, tokens, input_names):
    """
    Ensure inputs are presented in the correct order, without any None values

    Args:
        model: The model used to forward the input data
        tokens: BatchEncoding holding the input data
        input_names: The names of the inputs

    Returns: Tuple of (ordered input names, tuple of tensors to forward)
    """
    print("Ensuring inputs are in correct order")

    # Positional arguments of the model's forward method; index 0 is "self"
    model_args_name = model.forward.__code__.co_varnames
    model_args, ordered_input_names = [], []
    for arg_name in model_args_name[1:]:
        if arg_name in input_names:
            ordered_input_names.append(arg_name)
            model_args.append(tokens[arg_name])
        else:
            print(f"{arg_name} is not present in the generated input list.")
            break

    print(f"Generated inputs order: {ordered_input_names}")
    return ordered_input_names, tuple(model_args)
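
# For a BERT-like model whose forward signature starts with
# (input_ids, attention_mask, token_type_ids, ...), this would yield, e.g.
# (illustrative): (["input_ids", "attention_mask", "token_type_ids"],
# (tokens["input_ids"], tokens["attention_mask"], tokens["token_type_ids"])).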

def infer_shapes(nlp: Pipeline, framework: str) -> Tuple[List[str], List[str], Dict, BatchEncoding]:
    """
    Attempt to infer the static vs dynamic axes for each input and output tensor of a specific model

    Args:
        nlp: The pipeline object holding the model to be exported
        framework: The framework identifier to dispatch to the correct inference scheme (pt/tf)

    Returns:

        - List of the inferred input variable names
        - List of the inferred output variable names
        - Dictionary with input/output variable names as keys and dynamic axes as values
        - a BatchEncoding reference which was used to infer all the above information
    """

    def build_shape_dict(name: str, tensor, is_input: bool, seq_len: int):
        if isinstance(tensor, (tuple, list)):
            return [build_shape_dict(name, t, is_input, seq_len) for t in tensor]

        else:
            # Assume the batch axis is the first axis with exactly one element
            # (holds here because we tokenize a single sample)
            axes = {[axis for axis, numel in enumerate(tensor.shape) if numel == 1][0]: "batch"}
            if is_input:
                if len(tensor.shape) == 2:
                    axes[1] = "sequence"
                else:
                    raise ValueError(f"Unable to infer tensor axes ({len(tensor.shape)})")
            else:
                seq_axes = [dim for dim, shape in enumerate(tensor.shape) if shape == seq_len]
                axes.update({dim: "sequence" for dim in seq_axes})

        print(f"Found {'input' if is_input else 'output'} {name} with shape: {axes}")
        return axes

    tokens = nlp.tokenizer("This is a sample output", return_tensors=framework)
    seq_len = tokens.input_ids.shape[-1]
    # Keras models expect a plain dict rather than a BatchEncoding, hence tokens.data
    outputs = nlp.model(**tokens) if framework == "pt" else nlp.model(tokens.data)
    if isinstance(outputs, ModelOutput):
        outputs = outputs.to_tuple()
    if not isinstance(outputs, (list, tuple)):
        outputs = (outputs,)

    # Generate input names & axes
    input_vars = list(tokens.keys())
    input_dynamic_axes = {k: build_shape_dict(k, v, True, seq_len) for k, v in tokens.items()}

    # Flatten potentially grouped outputs (past for gpt2, attentions)
    outputs_flat = []
    for output in outputs:
        if isinstance(output, (tuple, list)):
            outputs_flat.extend(output)
        else:
            outputs_flat.append(output)

    # Generate output names & axes
    output_names = [f"output_{i}" for i in range(len(outputs_flat))]
    output_dynamic_axes = {k: build_shape_dict(k, v, False, seq_len) for k, v in zip(output_names, outputs_flat)}

    # Create the aggregated axes representation
    dynamic_axes = dict(input_dynamic_axes, **output_dynamic_axes)
    return input_vars, output_names, dynamic_axes, tokens
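
# For a 2D input of shape (1, seq_len) this produces entries such as
# {"input_ids": {0: "batch", 1: "sequence"}} (values illustrative), which is
# exactly the `dynamic_axes` mapping torch.onnx.export consumes below.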

def load_graph_from_args(
    pipeline_name: str, framework: str, model: str, tokenizer: Optional[str] = None, **models_kwargs
) -> Pipeline:
    """
    Convert the set of arguments provided through the CLI to an actual pipeline reference (tokenizer + model)

    Args:
        pipeline_name: The kind of pipeline to use (ner, question-answering, etc.)
        framework: The framework to load the pipeline with ("pt" or "tf")
        model: The model name which will be loaded by the pipeline
        tokenizer: The tokenizer name which will be loaded by the pipeline, defaults to the model's value

    Returns: Pipeline object
    """
    # If no tokenizer provided, fall back to the model name
    if tokenizer is None:
        tokenizer = model

    # Check the wanted framework is available
    if framework == "pt" and not is_torch_available():
        raise Exception("Cannot convert because PyTorch is not installed. Please install torch first.")
    if framework == "tf" and not is_tf_available():
        raise Exception("Cannot convert because TF is not installed. Please install tensorflow first.")

    print(f"Loading pipeline (model: {model}, tokenizer: {tokenizer})")

    # Allocate tokenizer and model
    return pipeline(pipeline_name, model=model, tokenizer=tokenizer, framework=framework, model_kwargs=models_kwargs)
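
# E.g. (model name illustrative):
#   nlp = load_graph_from_args("feature-extraction", "pt", "bert-base-cased")
# wraps pipeline("feature-extraction", model="bert-base-cased",
# tokenizer="bert-base-cased", framework="pt").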

def convert_pytorch(nlp: Pipeline, opset: int, output: Path, use_external_format: bool):
    """
    Export a PyTorch backed pipeline to ONNX Intermediate Representation (IR)

    Args:
        nlp: The pipeline to be exported
        opset: The actual version of the ONNX operator set to use
        output: Path where the generated ONNX model will be stored
        use_external_format: Split the model definition from its parameters to allow models bigger than 2GB
            (kept for signature compatibility; note it is not forwarded to the export call below)

    Returns:
    """
    if not is_torch_available():
        raise Exception("Cannot convert because PyTorch is not installed. Please install torch first.")

    import torch
    from torch.onnx import export

    print(f"Using framework PyTorch: {torch.__version__}")

    with torch.no_grad():
        input_names, output_names, dynamic_axes, tokens = infer_shapes(nlp, "pt")
        ordered_input_names, model_args = ensure_valid_input(nlp.model, tokens, input_names)

        export(
            nlp.model,
            model_args,
            f=output.as_posix(),
            input_names=ordered_input_names,
            output_names=output_names,
            dynamic_axes=dynamic_axes,
            do_constant_folding=True,
            opset_version=opset,
        )

def convert_tensorflow(nlp: Pipeline, opset: int, output: Path):
    """
    Export a TensorFlow backed pipeline to ONNX Intermediate Representation (IR)

    Args:
        nlp: The pipeline to be exported
        opset: The actual version of the ONNX operator set to use
        output: Path where the generated ONNX model will be stored

    Notes: TensorFlow cannot export models bigger than 2GB due to an internal constraint in TensorFlow
    """
    if not is_tf_available():
        raise Exception("Cannot convert because TF is not installed. Please install tensorflow first.")

    print("/!\\ Please note TensorFlow doesn't support exporting models > 2GB /!\\")

    try:
        import tensorflow as tf
        import tf2onnx
        from tf2onnx import __version__ as t2ov

        print(f"Using framework TensorFlow: {tf.version.VERSION}, tf2onnx: {t2ov}")

        # Build the sample inputs
        input_names, output_names, dynamic_axes, tokens = infer_shapes(nlp, "tf")

        # Forward the sample inputs once to build the graph, then export through tf2onnx
        nlp.model.predict(tokens.data)
        input_signature = [tf.TensorSpec.from_tensor(tensor, name=key) for key, tensor in tokens.items()]
        model_proto, _ = tf2onnx.convert.from_keras(
            nlp.model, input_signature, opset=opset, output_path=output.as_posix()
        )

    except ImportError as e:
        raise Exception(
            f"Cannot import {e.name} required to convert TF model to ONNX. Please install {e.name} first. {e}"
        )

def convert(
    framework: str,
    model: str,
    output: Path,
    opset: int,
    tokenizer: Optional[str] = None,
    use_external_format: bool = False,
    pipeline_name: str = "feature-extraction",
    **model_kwargs,
):
    """
    Convert the pipeline object to the ONNX Intermediate Representation (IR) format

    Args:
        framework: The framework the pipeline is backed by ("pt" or "tf")
        model: The name of the model to load for the pipeline
        output: The path where the ONNX graph will be stored
        opset: The actual version of the ONNX operator set to use
        tokenizer: The name of the tokenizer to load for the pipeline, defaults to the model's name if not provided
        use_external_format:
            Split the model definition from its parameters to allow models bigger than 2GB (PyTorch only)
        pipeline_name: The kind of pipeline to instantiate (ner, question-answering, etc.)
        model_kwargs: Keyword arguments to be forwarded to the model constructor

    Returns:
    """
    warnings.warn(
        "The `transformers.convert_graph_to_onnx` package is deprecated and will be removed in version 5 of"
        " Transformers",
        FutureWarning,
    )
    print(f"ONNX opset version set to: {opset}")

    # Load the pipeline
    nlp = load_graph_from_args(pipeline_name, framework, model, tokenizer, **model_kwargs)

    if not output.parent.exists():
        print(f"Creating folder {output.parent}")
        makedirs(output.parent.as_posix())
    elif len(listdir(output.parent.as_posix())) > 0:
        raise Exception(f"Folder {output.parent.as_posix()} is not empty, aborting conversion")

    # Export the graph
    if framework == "pt":
        convert_pytorch(nlp, opset, output, use_external_format)
    else:
        convert_tensorflow(nlp, opset, output)
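
# A minimal programmatic use, mirroring the CLI defaults (model name and path
# illustrative; the output folder must be empty or not yet exist):
#
#   from pathlib import Path
#   convert(framework="pt", model="bert-base-cased", output=Path("onnx/bert.onnx"), opset=11)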

def optimize(onnx_model_path: Path) -> Path:
    """
    Load the model at the specified path and let onnxruntime apply all the graph
    optimizations it can, saving the optimized graph next to the original one

    Args:
        onnx_model_path: filepath where the model binary description is stored

    Returns: Path where the optimized model binary description has been saved
    """
    from onnxruntime import InferenceSession, SessionOptions

    # Generate model name with suffix "-optimized"; onnxruntime writes the optimized
    # graph to that path as a side effect of building the session
    opt_model_path = generate_identified_filename(onnx_model_path, "-optimized")
    sess_option = SessionOptions()
    sess_option.optimized_model_filepath = opt_model_path.as_posix()
    _ = InferenceSession(onnx_model_path.as_posix(), sess_option)

    print(f"Optimized model has been written at {opt_model_path}: \N{HEAVY CHECK MARK}")
    print("/!\\ Optimized model contains hardware specific operators which might not be portable. /!\\")

    return opt_model_path

def quantize(onnx_model_path: Path) -> Path:
    """
    Quantize the weights of the model from float32 to int8 to allow very efficient inference on modern CPU

    Args:
        onnx_model_path: Path to the location where the exported ONNX model is stored

    Returns: The Path generated for the quantized model
    """
    import onnx
    import onnxruntime
    from onnx.onnx_pb import ModelProto
    from onnxruntime.quantization import QuantizationMode
    from onnxruntime.quantization.onnx_quantizer import ONNXQuantizer
    from onnxruntime.quantization.registry import IntegerOpsRegistry

    # Load the ONNX model
    onnx_model = onnx.load(onnx_model_path.as_posix())

    if parse(onnx.__version__) < parse("1.5.0"):
        print(
            "Models larger than 2GB will fail to quantize due to protobuf constraint.\n"
            "Please upgrade to onnx >= 1.5.0."
        )

    # Quantize on a copy so the original model proto is left untouched
    copy_model = ModelProto()
    copy_model.CopyFrom(onnx_model)

    # Construct the quantizer.
    # onnxruntime renamed the `input_qType` keyword to `activation_qType` in
    # v1.13.1, so dispatch on the installed version to stay compatible with both.
    if parse(onnxruntime.__version__) < parse("1.13.1"):
        quantizer = ONNXQuantizer(
            model=copy_model,
            per_channel=False,
            reduce_range=False,
            mode=QuantizationMode.IntegerOps,
            static=False,
            weight_qType=True,
            input_qType=False,
            tensors_range=None,
            nodes_to_quantize=None,
            nodes_to_exclude=None,
            op_types_to_quantize=list(IntegerOpsRegistry),
        )
    else:
        quantizer = ONNXQuantizer(
            model=copy_model,
            per_channel=False,
            reduce_range=False,
            mode=QuantizationMode.IntegerOps,
            static=False,
            weight_qType=True,
            activation_qType=False,
            tensors_range=None,
            nodes_to_quantize=None,
            nodes_to_exclude=None,
            op_types_to_quantize=list(IntegerOpsRegistry),
        )

    # Quantize the model in place
    quantizer.quantize_model()

    # Append "-quantized" at the end of the model's name
    quantized_model_path = generate_identified_filename(onnx_model_path, "-quantized")

    # Save model
    print(f"Quantized model has been written at {quantized_model_path}: \N{HEAVY CHECK MARK}")
    onnx.save_model(quantizer.model.model, quantized_model_path.as_posix())

    return quantized_model_path
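
# Typical chaining (paths illustrative): quantization works best on the optimized
# graph, which is exactly what the __main__ block below does:
#
#   optimized = optimize(Path("onnx/bert.onnx"))   # -> onnx/bert-optimized.onnx
#   quantized = quantize(optimized)                # -> onnx/bert-optimized-quantized.onnx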

def verify(path: Path):
    """Check that onnxruntime can load the exported model stored at `path`"""
    from onnxruntime import InferenceSession, SessionOptions
    from onnxruntime.capi.onnxruntime_pybind11_state import RuntimeException

    print(f"Checking ONNX model loading from: {path} ...")
    try:
        onnx_options = SessionOptions()
        _ = InferenceSession(path.as_posix(), onnx_options, providers=["CPUExecutionProvider"])
        print(f"Model {path} correctly loaded: \N{HEAVY CHECK MARK}")
    except RuntimeException as re:
        print(f"Error while loading the model {re}: \N{HEAVY BALLOT X}")

if __name__ == "__main__":
    parser = OnnxConverterArgumentParser()
    args = parser.parse_args()

    # Make sure output is an absolute path
    args.output = Path(args.output).absolute()

    try:
        print("\n====== Converting model to ONNX ======")

        # Convert
        convert(
            args.framework,
            args.model,
            args.output,
            args.opset,
            args.tokenizer,
            args.use_external_format,
            args.pipeline,
        )

        if args.quantize:
            # Ensure the requirements for quantization on onnxruntime are met
            check_onnxruntime_requirements(ORT_QUANTIZE_MINIMUM_VERSION)

            # onnxruntime optimizations don't provide the same level of performance on TensorFlow as on PyTorch
            if args.framework == "tf":
                print(
                    "\t Using TensorFlow might not provide the same optimization level compared to PyTorch.\n"
                    "\t For TensorFlow users you can try optimizing the model directly through onnxruntime_tools.\n"
                    "\t For more information, please refer to the onnxruntime documentation:\n"
                    "\t\thttps://github.com/microsoft/onnxruntime/tree/master/onnxruntime/python/tools/transformers\n"
                )

            print("\n====== Optimizing ONNX model ======")

            # Quantization works best when using the optimized version of the model
            args.optimized_output = optimize(args.output)

            # Do the quantization on the right graph
            args.quantized_output = quantize(args.optimized_output)

        # And verify
        if args.check_loading:
            print("\n====== Check exported ONNX model(s) ======")
            verify(args.output)

            if hasattr(args, "optimized_output"):
                verify(args.optimized_output)

            if hasattr(args, "quantized_output"):
                verify(args.quantized_output)

    except Exception as e:
        print(f"Error while converting the model: {e}")
        exit(1)